From e47252f3c6e95bf009402d9cddec81b11df6a89b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 8 Apr 2026 17:08:30 -0400 Subject: [PATCH 01/70] Text Sizing Protocol (WIP/DRAFT!) --- docs/specs.rst | 51 ++++++ tests/test_osc66.py | 310 ++++++++++++++++++++++++++++++++++++ wcwidth/__init__.py | 8 +- wcwidth/escape_sequences.py | 7 + wcwidth/osc66.py | 280 ++++++++++++++++++++++++++++++++ wcwidth/table_grapheme.py | 48 +++--- wcwidth/table_mc.py | 8 +- wcwidth/table_wide.py | 20 +-- wcwidth/table_zero.py | 18 +-- wcwidth/wcwidth.py | 68 ++++++-- 10 files changed, 760 insertions(+), 58 deletions(-) create mode 100644 tests/test_osc66.py create mode 100644 wcwidth/osc66.py diff --git a/docs/specs.rst b/docs/specs.rst index ba1d885..ef4ef31 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -119,6 +119,57 @@ formation: the font engine merges the consonants into a single ligature glyph. See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". +OSC 66 (Kitty Text Sizing Protocol) +------------------------------------ + +The `Kitty Text Sizing Protocol`_ (OSC 66) allows applications to explicitly +declare how many terminal cells text occupies, using the escape sequence:: + + ESC ] 66 ; metadata ; text BEL/ST + +Where ``metadata`` is colon-separated ``key=value`` pairs and the terminator +is either BEL (``0x07``) or ST (``ESC \``). + +Metadata parameters: + +- ``s``: Scale factor (1--7, default 1). Text occupies ``s`` rows tall and + ``s * w`` columns wide. +- ``w``: Width in cells (0--7, default 0). When 0, width is auto-calculated + from the inner text. +- ``n``: Fractional scaling numerator (0--15, default 0). +- ``d``: Fractional scaling denominator (0--15, default 0). +- ``v``: Vertical alignment (0=top, 1=bottom, 2=center; default 0). +- ``h``: Horizontal alignment (0=left, 1=right, 2=center; default 0). 
+ +Width calculation by :func:`wcwidth.width`: + +- When ``w > 0``: the sequence occupies exactly ``s * w`` cells, regardless + of the inner text content. +- When ``w == 0``: the sequence occupies ``s * inner_text_width`` cells, where + ``inner_text_width`` is the measured width of the text payload. + +The fractional scaling parameters (``n`` and ``d``) adjust the rendered font +size within the allocated cells but do not change the cell count. + +OSC 66 sequences are handled in all ``control_codes`` modes (``'parse'``, +``'strict'``, ``'ignore'``), since they declare explicit width rather than +causing indeterminate cursor movement. + +:func:`wcwidth.strip_sequences` extracts the inner text payload from OSC 66 +sequences while stripping the escape wrapper. + +:func:`wcwidth.clip` treats each OSC 66 sequence as an atomic unit of its +declared width. If the sequence straddles a clip boundary, it is replaced +with fill characters. + +Generation functions :func:`wcwidth.osc66_wrap` and :func:`wcwidth.osc66_scale` +produce correctly formatted OSC 66 sequences. The text payload is limited to +4096 bytes (UTF-8 encoded) per the protocol specification. + +See also: `Kitty Text Sizing Protocol`_. + +.. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ + .. _`U+0000`: https://codepoints.net/U+0000 .. _`U+0001`: https://codepoints.net/U+0001 .. 
_`U+001F`: https://codepoints.net/U+001F diff --git a/tests/test_osc66.py b/tests/test_osc66.py new file mode 100644 index 0000000..1ba1128 --- /dev/null +++ b/tests/test_osc66.py @@ -0,0 +1,310 @@ +"""Tests for OSC 66 (Kitty Text Sizing Protocol) support.""" +# 3rd party +import pytest + +# local +import wcwidth +from wcwidth.osc66 import ( + OSC66Metadata, + parse_osc66_metadata, + make_osc66_metadata, + parse_osc66_sequence, + osc66_width, + make_osc66_sequence, + osc66_wrap, + osc66_scale, + _replace_osc66_with_padding, +) + + +PARSE_METADATA_CASES = [ + ('', OSC66Metadata()), + ('s=2', OSC66Metadata(scale=2)), + ('w=3', OSC66Metadata(width=3)), + ('s=2:w=3', OSC66Metadata(scale=2, width=3)), + ('s=2:w=3:n=1:d=2:v=1:h=2', + OSC66Metadata(scale=2, width=3, numerator=1, denominator=2, + vertical_align=1, horizontal_align=2)), + ('n=5:d=10', OSC66Metadata(numerator=5, denominator=10)), + ('v=0:h=0', OSC66Metadata()), + ('s=1:w=0', OSC66Metadata()), +] + + +@pytest.mark.parametrize('raw,expected', PARSE_METADATA_CASES) +def test_parse_osc66_metadata(raw, expected): + assert parse_osc66_metadata(raw) == expected + + +PARSE_METADATA_CLAMP_CASES = [ + ('s=0', OSC66Metadata(scale=1)), + ('s=9', OSC66Metadata(scale=7)), + ('w=8', OSC66Metadata(width=7)), + ('n=20', OSC66Metadata(numerator=15)), + ('d=99', OSC66Metadata(denominator=15)), + ('v=5', OSC66Metadata(vertical_align=2)), + ('h=3', OSC66Metadata(horizontal_align=2)), + ('w=-1', OSC66Metadata(width=0)), +] + + +@pytest.mark.parametrize('raw,expected', PARSE_METADATA_CLAMP_CASES) +def test_parse_osc66_metadata_clamp(raw, expected): + assert parse_osc66_metadata(raw) == expected + + +PARSE_METADATA_EDGE_CASES = [ + ('unknown=5', OSC66Metadata()), + ('s=2:unknown=5:w=3', OSC66Metadata(scale=2, width=3)), + ('s=abc', OSC66Metadata()), + ('s=', OSC66Metadata()), + ('noequalssign', OSC66Metadata()), + ('s=2:w=3:', OSC66Metadata(scale=2, width=3)), + (':s=2', OSC66Metadata(scale=2)), +] + + 
+@pytest.mark.parametrize('raw,expected', PARSE_METADATA_EDGE_CASES) +def test_parse_osc66_metadata_edge(raw, expected): + assert parse_osc66_metadata(raw) == expected + + +MAKE_METADATA_CASES = [ + (OSC66Metadata(), ''), + (OSC66Metadata(scale=2), 's=2'), + (OSC66Metadata(width=3), 'w=3'), + (OSC66Metadata(scale=2, width=3), 's=2:w=3'), + (OSC66Metadata(scale=2, width=3, numerator=1, denominator=2, + vertical_align=1, horizontal_align=2), + 's=2:w=3:n=1:d=2:v=1:h=2'), +] + + +@pytest.mark.parametrize('meta,expected', MAKE_METADATA_CASES) +def test_make_osc66_metadata(meta, expected): + assert make_osc66_metadata(meta) == expected + + +METADATA_ROUNDTRIP_CASES = [ + OSC66Metadata(), + OSC66Metadata(scale=3), + OSC66Metadata(scale=2, width=5), + OSC66Metadata(scale=7, width=7, numerator=15, denominator=15, + vertical_align=2, horizontal_align=2), + OSC66Metadata(numerator=1, denominator=2), +] + + +@pytest.mark.parametrize('meta', METADATA_ROUNDTRIP_CASES) +def test_metadata_roundtrip(meta): + assert parse_osc66_metadata(make_osc66_metadata(meta)) == meta + + +PARSE_SEQUENCE_CASES = [ + ('\x1b]66;s=2;hello\x07', + (OSC66Metadata(scale=2), 'hello', '\x07')), + ('\x1b]66;s=2;hello\x1b\\', + (OSC66Metadata(scale=2), 'hello', '\x1b\\')), + ('\x1b]66;;text\x07', + (OSC66Metadata(), 'text', '\x07')), + ('\x1b]66;s=3:w=2;\x07', + (OSC66Metadata(scale=3, width=2), '', '\x07')), + ('\x1b]66;w=5;AB\x07', + (OSC66Metadata(width=5), 'AB', '\x07')), +] + + +@pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) +def test_parse_osc66_sequence(seq, expected): + assert parse_osc66_sequence(seq) == expected + + +PARSE_SEQUENCE_NONE_CASES = [ + '\x1b[31m', + '\x1b]0;title\x07', + '\x1b]65;s=2;text\x07', + 'plain text', + '', + '\x1b]66;missing_second_semi\x07', +] + + +@pytest.mark.parametrize('seq', PARSE_SEQUENCE_NONE_CASES) +def test_parse_osc66_sequence_none(seq): + assert parse_osc66_sequence(seq) is None + + +OSC66_WIDTH_CASES = [ + (OSC66Metadata(scale=2, width=3), 
'anything', 6), + (OSC66Metadata(scale=1, width=5), '', 5), + (OSC66Metadata(scale=3, width=1), 'x', 3), + (OSC66Metadata(scale=1, width=0), 'AB', 2), + (OSC66Metadata(scale=2, width=0), 'AB', 4), + (OSC66Metadata(scale=1, width=0), '\u4e2d', 2), + (OSC66Metadata(scale=2, width=0), '\u4e2d', 4), + (OSC66Metadata(scale=1, width=0), '', 0), + (OSC66Metadata(scale=3, width=0), '', 0), +] + + +@pytest.mark.parametrize('meta,inner,expected', OSC66_WIDTH_CASES) +def test_osc66_width(meta, inner, expected): + assert osc66_width(meta, inner) == expected + + +MAKE_SEQUENCE_CASES = [ + ('hi', OSC66Metadata(scale=2, width=1), '\x07', + '\x1b]66;s=2:w=1;hi\x07'), + ('AB', OSC66Metadata(scale=2, width=2), '\x1b\\', + '\x1b]66;s=2:w=2;AB\x1b\\'), + ('x', OSC66Metadata(), '\x07', + '\x1b]66;;x\x07'), + ('', OSC66Metadata(scale=3, width=2), '\x07', + '\x1b]66;s=3:w=2;\x07'), +] + + +@pytest.mark.parametrize('text,meta,term,expected', MAKE_SEQUENCE_CASES) +def test_make_osc66_sequence(text, meta, term, expected): + assert make_osc66_sequence(text, meta, term) == expected + + +def test_make_osc66_sequence_payload_limit(): + text = 'x' * 4097 + with pytest.raises(ValueError, match='4096'): + make_osc66_sequence(text, OSC66Metadata()) + + +WRAP_CASES = [ + (dict(text='AB', scale=2, width=2), + '\x1b]66;s=2:w=2;AB\x07'), + (dict(text='AB', scale=2, width=2, terminator='\x1b\\'), + '\x1b]66;s=2:w=2;AB\x1b\\'), + (dict(text='x', scale=1), + '\x1b]66;;x\x07'), + (dict(text='hi', scale=3, width=1, numerator=1, denominator=2, + vertical_align=1, horizontal_align=2), + '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;hi\x07'), +] + + +@pytest.mark.parametrize('kwargs,expected', WRAP_CASES) +def test_osc66_wrap(kwargs, expected): + assert osc66_wrap(**kwargs) == expected + + +SCALE_CASES = [ + ('AB', 2, '\x1b]66;s=2:w=2;AB\x07'), + ('\u4e2d', 2, '\x1b]66;s=2:w=2;\u4e2d\x07'), + ('x', 3, '\x1b]66;s=3:w=1;x\x07'), + ('hello', 1, '\x1b]66;w=5;hello\x07'), +] + + +@pytest.mark.parametrize('text,scale,expected', 
SCALE_CASES) +def test_osc66_scale(text, scale, expected): + assert osc66_scale(text, scale) == expected + + +def test_osc66_scale_st_terminator(): + result = osc66_scale('AB', 2, terminator='\x1b\\') + assert result == '\x1b]66;s=2:w=2;AB\x1b\\' + + +# --- Integration tests: width() --- + +WIDTH_PARSE_CASES = [ + ('\x1b]66;s=2:w=3;anything\x07', 6), + ('\x1b]66;w=3;x\x07', 3), + ('\x1b]66;s=1:w=0;AB\x07', 2), + ('\x1b]66;s=2:w=0;AB\x07', 4), + ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), + ('\x1b]66;s=1:w=0;\x07', 0), + ('abc\x1b]66;w=3;x\x07def', 9), + ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), + ('\x1b]66;s=2:w=3;text\x1b\\', 6), + ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), +] + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_osc66_parse(text, expected): + assert wcwidth.width(text) == expected + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_osc66_ignore(text, expected): + assert wcwidth.width(text, control_codes='ignore') == expected + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_osc66_strict(text, expected): + assert wcwidth.width(text, control_codes='strict') == expected + + +# --- Integration tests: strip_sequences() --- + +STRIP_OSC66_CASES = [ + ('\x1b]66;s=2;hello\x07', 'hello'), + ('\x1b]66;s=2;hello\x1b\\', 'hello'), + ('\x1b]66;;text\x07', 'text'), + ('\x1b]66;s=3:w=2;\x07', ''), + ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), + ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), + ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), +] + + +@pytest.mark.parametrize('text,expected', STRIP_OSC66_CASES) +def test_strip_sequences_osc66(text, expected): + assert wcwidth.strip_sequences(text) == expected + + +# --- Integration tests: iter_sequences() --- + +def test_iter_sequences_osc66(): + text = 'abc\x1b]66;s=2;hello\x07def' + segments = list(wcwidth.iter_sequences(text)) + assert segments == [ + ('abc', False), + ('\x1b]66;s=2;hello\x07', True), + ('def', False), + ] + + +def 
test_iter_sequences_osc66_st(): + text = '\x1b]66;w=2;AB\x1b\\' + segments = list(wcwidth.iter_sequences(text)) + assert segments == [('\x1b]66;w=2;AB\x1b\\', True)] + + +# --- Integration tests: clip() --- + +CLIP_OSC66_CASES = [ + ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), + ('\x1b]66;w=3;ABC\x07', 0, 2, ' '), + ('\x1b]66;w=3;ABC\x07', 1, 3, ' '), + ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab '), + ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), +] + + +@pytest.mark.parametrize('text,start,end,expected', CLIP_OSC66_CASES) +def test_clip_osc66(text, start, end, expected): + assert wcwidth.clip(text, start, end) == expected + + +# --- Internal helper --- + +REPLACE_PADDING_CASES = [ + ('\x1b]66;w=3;x\x07', ' '), + ('\x1b]66;s=2:w=2;AB\x07', ' '), + ('abc\x1b]66;w=1;x\x07def', 'abc def'), + ('no osc66 here', 'no osc66 here'), +] + + +@pytest.mark.parametrize('text,expected', REPLACE_PADDING_CASES) +def test_replace_osc66_with_padding(text, expected): + assert _replace_osc66_with_padding(text) == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 400c8a6..c24bbe7 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -30,13 +30,19 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr +from .osc66 import (OSC66Metadata, + parse_osc66_sequence, + osc66_wrap, + osc66_scale) # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr') + 'list_versions', 'propagate_sgr', + 'OSC66Metadata', 'parse_osc66_sequence', 'osc66_wrap', + 'osc66_scale') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index d4ac6cc..9478694 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -8,6 +8,13 @@ # std imports import re +# OSC 66 (Kitty Text Sizing Protocol) — has positive width, must be checked before ZERO_WIDTH_PATTERN. +# Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). +# https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +OSC66_PATTERN = re.compile( + r'\x1b\]66;([^;\x07\x1b]*);([^\x07\x1b]*)(\x07|\x1b\\)' +) + # Zero-width escape sequences (SGR, OSC, CSI, etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( diff --git a/wcwidth/osc66.py b/wcwidth/osc66.py new file mode 100644 index 0000000..8d120eb --- /dev/null +++ b/wcwidth/osc66.py @@ -0,0 +1,280 @@ +r""" +OSC 66 (Kitty Text Sizing Protocol) parsing and generation. 
# Upper bound, in UTF-8 encoded bytes, of the text payload of one sequence.
_MAX_TEXT_PAYLOAD = 4096

# Metadata key → (NamedTuple field, min, max, default)
_META_FIELDS = {
    's': ('scale', 1, 7, 1),
    'w': ('width', 0, 7, 0),
    'n': ('numerator', 0, 15, 0),
    'd': ('denominator', 0, 15, 0),
    'v': ('vertical_align', 0, 2, 0),
    'h': ('horizontal_align', 0, 2, 0),
}

# Reverse map: field name → short key
_FIELD_TO_KEY = {field: key for key, (field, _, _, _) in _META_FIELDS.items()}


class OSC66Metadata(NamedTuple):
    """Parsed metadata from an OSC 66 escape sequence.

    :param scale: Scale factor (1--7). Text occupies ``scale`` rows tall
        and ``scale * width`` columns wide.
    :param width: Width in cells (0--7). When 0, width is auto-calculated
        from the inner text.
    :param numerator: Fractional scaling numerator (0--15).
    :param denominator: Fractional scaling denominator (0--15).
    :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center).
    :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center).
    """

    scale: int = 1
    width: int = 0
    numerator: int = 0
    denominator: int = 0
    vertical_align: int = 0
    horizontal_align: int = 0


def _clamped_int(val_str: str, lo: int, hi: int) -> int | None:
    """Parse a signed ASCII decimal and clamp it to ``lo..hi``.

    :param val_str: Raw value text taken from a ``key=value`` pair.
    :param lo: Smallest value that may be returned.
    :param hi: Largest value that may be returned.
    :returns: The clamped integer, or ``None`` when ``val_str`` is not an
        optional sign followed by one or more ASCII digits.
    """
    negative = val_str.startswith('-')
    digits = val_str[1:] if val_str[:1] in ('+', '-') else val_str
    # int() alone would also accept '1_0', ' 3' and non-ASCII digits, none of
    # which are decimal parameters of an escape sequence.
    if not (digits.isascii() and digits.isdigit()):
        return None
    digits = digits.lstrip('0')
    # Every valid range is far below 10**4, so a longer number saturates.
    # Deciding this on length keeps the result identical on interpreters that
    # limit int() string conversion length and on those that do not.
    if len(digits) > 4:
        return lo if negative else hi
    value = int(digits or '0')
    if negative:
        value = -value
    return max(lo, min(hi, value))


def parse_osc66_metadata(raw: str) -> OSC66Metadata:
    """Parse colon-separated ``key=value`` metadata string.

    :param raw: Metadata string, e.g. ``'s=2:w=3'``.
    :returns: Parsed metadata with values clamped to valid ranges.
        Unknown keys and parts without ``=`` are ignored. A value that is
        not a signed ASCII decimal integer resets its field to the default.

    Example::

        >>> parse_osc66_metadata('s=2:w=3')
        OSC66Metadata(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0)
        >>> parse_osc66_metadata('')
        OSC66Metadata(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0)
    """
    kwargs: dict[str, int] = {}
    for part in raw.split(':'):
        key, sep, val_str = part.partition('=')
        if not sep or key not in _META_FIELDS:
            continue
        field, lo, hi, default = _META_FIELDS[key]
        value = _clamped_int(val_str, lo, hi)
        kwargs[field] = default if value is None else value
    return OSC66Metadata(**kwargs)


def make_osc66_metadata(meta: OSC66Metadata) -> str:
    """Serialize metadata, omitting fields at their default values.

    :param meta: Metadata to serialize.
    :returns: Colon-separated ``key=value`` string, keys in the order
        ``s``, ``w``, ``n``, ``d``, ``v``, ``h``.

    Example::

        >>> make_osc66_metadata(OSC66Metadata(scale=2, width=3))
        's=2:w=3'
        >>> make_osc66_metadata(OSC66Metadata())
        ''
    """
    defaults = OSC66Metadata()
    return ':'.join(
        f'{key}={getattr(meta, field)}'
        for field, key in _FIELD_TO_KEY.items()
        if getattr(meta, field) != getattr(defaults, field)
    )


def parse_osc66_sequence(seq: str) -> tuple[OSC66Metadata, str, str] | None:
    r"""Parse a complete OSC 66 escape sequence.

    :param seq: Full escape sequence string; it must match in its entirety.
    :returns: Tuple of ``(metadata, inner_text, terminator)`` or ``None``
        if the string is not a valid OSC 66 sequence.

    Example::

        >>> parse_osc66_sequence('\x1b]66;s=2;hello\x07')
        (OSC66Metadata(scale=2, ...), 'hello', '\x07')
        >>> parse_osc66_sequence('\x1b[31m') is None
        True
    """
    match = OSC66_PATTERN.fullmatch(seq)
    if match is None:
        return None
    raw_meta, inner_text, terminator = match.group(1, 2, 3)
    return parse_osc66_metadata(raw_meta), inner_text, terminator


def osc66_width(
    meta: OSC66Metadata,
    inner_text: str,
    ambiguous_width: int = 1,
) -> int:
    """Calculate the display width of an OSC 66 sequence.

    :param meta: Parsed metadata.
    :param inner_text: The text payload of the OSC 66 sequence.
    :param ambiguous_width: Width for East Asian Ambiguous characters.
    :returns: Display width in terminal cells.

    When ``meta.width > 0``, returns ``meta.scale * meta.width``.
    When ``meta.width == 0``, returns ``meta.scale * measured_inner_width``.
    """
    if meta.width > 0:
        return meta.scale * meta.width
    # Lazy import to avoid circular dependency (wcwidth -> osc66 -> wcwidth)
    from .wcwidth import wcswidth  # pylint: disable=import-outside-toplevel
    inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width)
    # A negative measurement counts as zero cells.
    return meta.scale * max(0, inner_w)


def make_osc66_sequence(
    text: str,
    meta: OSC66Metadata,
    terminator: str = '\x07',
) -> str:
    r"""Build a complete OSC 66 escape sequence.

    :param text: Text payload.
    :param meta: Metadata to encode.
    :param terminator: Sequence terminator, ``'\x07'`` (BEL) or
        ``'\x1b\\'`` (ST). Default is BEL.
    :returns: Complete escape sequence string.
    :raises ValueError: If text contains BEL or ESC, or exceeds 4096 bytes
        when UTF-8 encoded.

    Example::

        >>> make_osc66_sequence('hi', OSC66Metadata(scale=2, width=1))
        '\x1b]66;s=2:w=1;hi\x07'
    """
    # BEL and ESC end the OSC string early: the remainder of the payload would
    # be interpreted by the terminal, and parse_osc66_sequence() could not
    # read the result back.
    if '\x07' in text or '\x1b' in text:
        raise ValueError("OSC 66 text payload must not contain BEL or ESC")
    if len(text.encode('utf-8')) > _MAX_TEXT_PAYLOAD:
        raise ValueError(
            f"OSC 66 text payload exceeds {_MAX_TEXT_PAYLOAD} byte limit"
        )
    metadata_str = make_osc66_metadata(meta)
    return f'\x1b]66;{metadata_str};{text}{terminator}'


def osc66_wrap(
    text: str,
    *,
    scale: int = 1,
    width: int = 0,
    numerator: int = 0,
    denominator: int = 0,
    vertical_align: int = 0,
    horizontal_align: int = 0,
    terminator: str = '\x07',
) -> str:
    r"""Wrap text in an OSC 66 escape sequence with full control over metadata.

    :param text: Text payload.
    :param scale: Scale factor (1--7).
    :param width: Width in cells (0--7). 0 means auto-calculate.
    :param numerator: Fractional scaling numerator (0--15).
    :param denominator: Fractional scaling denominator (0--15).
    :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center).
    :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center).
    :param terminator: ``'\x07'`` (BEL) or ``'\x1b\\'`` (ST).
    :returns: Complete OSC 66 escape sequence.
    :raises ValueError: If text contains BEL or ESC, or exceeds 4096 bytes.

    Example::

        >>> osc66_wrap('AB', scale=2, width=2)
        '\x1b]66;s=2:w=2;AB\x07'
    """
    meta = OSC66Metadata(
        scale=scale,
        width=width,
        numerator=numerator,
        denominator=denominator,
        vertical_align=vertical_align,
        horizontal_align=horizontal_align,
    )
    return make_osc66_sequence(text, meta, terminator)


def osc66_scale(
    text: str,
    scale: int,
    *,
    terminator: str = '\x07',
    ambiguous_width: int = 1,
) -> str:
    r"""Wrap text in an OSC 66 sequence, auto-calculating width from inner text.

    This is the most common use case: scale text to ``scale`` times its
    natural width, with the ``w`` parameter set automatically.

    :param text: Text payload.
    :param scale: Scale factor (1--7).
    :param terminator: ``'\x07'`` (BEL) or ``'\x1b\\'`` (ST).
    :param ambiguous_width: Width for East Asian Ambiguous characters.
    :returns: Complete OSC 66 escape sequence. ``w`` is the measured width of
        ``text`` when that fits the 0--7 range; wider text is emitted with
        ``w`` left at 0 (auto-calculate), which :func:`osc66_width` measures
        as the same ``scale * inner_width`` cells.
    :raises ValueError: If text contains BEL or ESC, or exceeds 4096 bytes.

    Example::

        >>> osc66_scale('AB', 2)
        '\x1b]66;s=2:w=2;AB\x07'
    """
    from .wcwidth import wcswidth  # pylint: disable=import-outside-toplevel
    inner_w = max(0, wcswidth(text, ambiguous_width=ambiguous_width))
    max_width = _META_FIELDS['w'][2]
    # An explicit w above the maximum is clamped by parse_osc66_metadata(),
    # so the declared width would no longer match the text.
    width = inner_w if inner_w <= max_width else 0
    meta = OSC66Metadata(scale=scale, width=width)
    return make_osc66_sequence(text, meta, terminator)


def _replace_osc66_with_padding(
    text: str,
    ambiguous_width: int = 1,
) -> str:
    """Replace each OSC 66 sequence with spaces matching its declared width.

    Used internally by ``_width_ignored_codes`` to account for OSC 66
    width before stripping other sequences.

    :param text: Text that may contain OSC 66 sequences.
    :param ambiguous_width: Width for East Asian Ambiguous characters.
    :returns: ``text`` with every OSC 66 sequence swapped for as many
        spaces as :func:`osc66_width` reports for it.
    """
    def _as_padding(match: 're.Match[str]') -> str:
        meta = parse_osc66_metadata(match.group(1))
        return ' ' * osc66_width(meta, match.group(2), ambiguous_width)

    return OSC66_PATTERN.sub(_as_padding, text)
""" # pylint: disable=duplicate-code @@ -202,8 +202,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b44,), # Balinese Vowel Sign Pepe..Balinese Adeg Adeg @@ -284,7 +284,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -367,9 +367,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a99,), # Soyombo Gemination Mark ..Soyombo Subjoiner - (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe - (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short - (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E + (0x11b60, 0x11b60,), # (nil) + (0x11b62, 0x11b64,), # (nil) + (0x11b66, 0x11b66,), # (nil) (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -426,10 +426,10 @@ (0x1e2ec, 
0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri @@ -617,9 +617,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe - (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O - (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O + (0x11b61, 0x11b61,), # (nil) + (0x11b65, 0x11b65,), # (nil) + (0x11b67, 0x11b67,), # (nil) (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya @@ -1892,8 +1892,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b43,), # Balinese Vowel Sign Pepe..Balinese Vowel Sign Pepe @@ -1972,7 +1972,7 @@ (0x10d24, 0x10d27,), # Hanifi 
Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -2055,9 +2055,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a98,), # Soyombo Gemination Mark - (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe - (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short - (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E + (0x11b60, 0x11b60,), # (nil) + (0x11b62, 0x11b64,), # (nil) + (0x11b66, 0x11b66,), # (nil) (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -2114,10 +2114,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 7c2e691..59cce63 100644 --- 
a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,7 +1,7 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. """ # pylint: disable=duplicate-code CATEGORY_MC = { @@ -181,9 +181,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe - (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O - (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O + (0x11b61, 0x11b61,), # (nil) + (0x11b65, 0x11b65,), # (nil) + (0x11b67, 0x11b67,), # (nil) (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index ed6f48a..0f0385e 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. 
""" # pylint: disable=duplicate-code WIDE_EASTASIAN = { @@ -71,10 +71,10 @@ (0x0ff01, 0x0ff60,), # Fullwidth Exclamation Ma..Fullwidth Right White Pa (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma - (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be + (0x16ff2, 0x16ff6,), # (nil) (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) - (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 + (0x18d80, 0x18df2,), # (nil) (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N (0x1affd, 0x1affe,), # Katakana Letter Minnan N..Katakana Letter Minnan N @@ -116,7 +116,7 @@ (0x1f680, 0x1f6c5,), # Rocket ..Left Luggage (0x1f6cc, 0x1f6cc,), # Sleeping Accommodation (0x1f6d0, 0x1f6d2,), # Place Of Worship ..Shopping Trolley - (0x1f6d5, 0x1f6d8,), # Hindu Temple ..Landslide + (0x1f6d5, 0x1f6d8,), # Hindu Temple ..(nil) (0x1f6dc, 0x1f6df,), # Wireless ..Ring Buoy (0x1f6eb, 0x1f6ec,), # Airplane Departure ..Airplane Arriving (0x1f6f4, 0x1f6fc,), # Scooter ..Roller Skate @@ -126,12 +126,12 @@ (0x1f93c, 0x1f945,), # Wrestlers ..Goal Net (0x1f947, 0x1f9ff,), # First Place Medal ..Nazar Amulet (0x1fa70, 0x1fa7c,), # Ballet Shoes ..Crutch - (0x1fa80, 0x1fa8a,), # Yo-yo ..Trombone - (0x1fa8e, 0x1fac6,), # Treasure Chest ..Fingerprint - (0x1fac8, 0x1fac8,), # Hairy Creature - (0x1facd, 0x1fadc,), # Orca ..Root Vegetable - (0x1fadf, 0x1faea,), # Splatter ..Distorted Face - (0x1faef, 0x1faf8,), # Fight Cloud ..Rightwards Pushing Hand + (0x1fa80, 0x1fa8a,), # Yo-yo ..(nil) + (0x1fa8e, 0x1fac6,), # (nil) ..Fingerprint + (0x1fac8, 0x1fac8,), # (nil) + (0x1facd, 0x1fadc,), # (nil) ..Root Vegetable + (0x1fadf, 0x1faea,), # Splatter ..(nil) + (0x1faef, 0x1faf8,), # (nil) ..Rightwards Pushing Hand (0x20000, 0x2fffd,), # Cjk 
Unified Ideograph-20..(nil) (0x30000, 0x3fffd,), # Cjk Unified Ideograph-30..(nil) ), diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index c440bfc..cb4bdba 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,7 +1,7 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. """ # pylint: disable=duplicate-code ZERO_WIDTH = { @@ -147,8 +147,8 @@ (0x01a55, 0x01a5e,), # Tai Tham Consonant Sign ..Tai Tham Consonant Sign (0x01a60, 0x01a7c,), # Tai Tham Sign Sakot ..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b04,), # Balinese Sign Ulu Ricem ..Balinese Sign Bisah (0x01b34, 0x01b44,), # Balinese Sign Rerekan ..Balinese Adeg Adeg (0x01b6b, 0x01b73,), # Balinese Musical Symbol ..Balinese Musical Symbol @@ -222,7 +222,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11000, 0x11002,), # Brahmi Sign Candrabindu ..Brahmi Sign Visarga @@ -284,7 +284,7 @@ (0x11a47, 0x11a47,), # Zanabazar Square Subjoiner (0x11a51, 0x11a5b,), # Soyombo Vowel Sign I ..Soyombo Vowel Length Mar (0x11a8a, 0x11a99,), # Soyombo Final Consonant ..Soyombo Subjoiner 
- (0x11b60, 0x11b67,), # Sharada Vowel Sign Oe ..Sharada Vowel Sign Candr + (0x11b60, 0x11b67,), # (nil) (0x11c2f, 0x11c36,), # Bhaiksuki Vowel Sign Aa ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3f,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Virama (0x11c92, 0x11ca7,), # Marchen Subjoined Letter..Marchen Subjoined Letter @@ -339,10 +339,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0xe0000, 0xe0fff,), # (nil) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index f6edf5f..6719954 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -84,9 +84,13 @@ from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import (ZERO_WIDTH_PATTERN, + OSC66_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE) +from .osc66 import (parse_osc66_metadata, + osc66_width as _osc66_width, + _replace_osc66_with_padding) from .unicode_versions import list_versions if TYPE_CHECKING: # pragma: no cover @@ -463,7 +467,10 @@ def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: Fast path for width() with control_codes='ignore'. Strips escape sequences and control characters, then measures remaining text. + OSC 66 sequences are replaced with padding of correct width before stripping. 
""" + if '\x1b]66;' in text: + text = _replace_osc66_with_padding(text, ambiguous_width) return wcswidth( strip_sequences(text).translate(_CONTROL_CHAR_TABLE), ambiguous_width=ambiguous_width @@ -577,6 +584,17 @@ def width( # 1. Handle ESC sequences if char == '\x1b': + # 1a. OSC 66 (text sizing) has positive width — check before zero-width path + if text[idx:idx + 5] == '\x1b]66;': + osc66_match = OSC66_PATTERN.match(text, idx) + if osc66_match: + meta = parse_osc66_metadata(osc66_match.group(1)) + current_col += _osc66_width( + meta, osc66_match.group(2), ambiguous_width + ) + idx = osc66_match.end() + max_extent = max(max_extent, current_col) + continue match = ZERO_WIDTH_PATTERN.match(text, idx) if match: seq = match.group() @@ -861,6 +879,8 @@ def strip_sequences(text: str) -> str: >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') 'bold red text' """ + if '\x1b]66;' in text: + text = OSC66_PATTERN.sub(r'\2', text) return ZERO_WIDTH_PATTERN.sub('', text) @@ -958,16 +978,44 @@ def clip( break # Handle escape sequences - if char == '\x1b' and (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() - if propagate_sgr and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - else: - # Non-SGR sequences always preserved - output.append(seq) - idx = match.end() - continue + if char == '\x1b': + # OSC 66 (text sizing) has positive width — handle before zero-width path + if text[idx:idx + 5] == '\x1b]66;': + osc66_match = OSC66_PATTERN.match(text, idx) + if osc66_match: + meta = parse_osc66_metadata(osc66_match.group(1)) + w = _osc66_width( + meta, osc66_match.group(2), ambiguous_width + ) + if w == 0: + if start <= col < end: + output.append(osc66_match.group()) + elif col >= start and col + w <= end: + output.append(osc66_match.group()) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += w + elif col < end and col + w > start: + 
visible = min(end, col + w) - max(start, col) + output.append(fillchar * visible) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += w + else: + col += w + idx = osc66_match.end() + continue + + if (match := ZERO_WIDTH_PATTERN.match(text, idx)): + seq = match.group() + if propagate_sgr and _SGR_PATTERN.match(seq): + # Update SGR state; will be applied as prefix when visible content starts + sgr = _sgr_state_update(sgr, seq) + else: + # Non-SGR sequences always preserved + output.append(seq) + idx = match.end() + continue # Handle bare ESC (not a valid sequence) if char == '\x1b': From f1249fc1fe0b2d854a52ccb9fcbf47085c56a242 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 10 Apr 2026 17:18:58 -0400 Subject: [PATCH 02/70] Mass rename 'OSC 66' -> 'Text Sizing' --- docs/specs.rst | 5 +- tests/test_osc66.py | 310 ----------------------------------- tests/test_text_sizing.py | 316 ++++++++++++++++++++++++++++++++++++ wcwidth/__init__.py | 9 +- wcwidth/escape_sequences.py | 4 +- wcwidth/osc66.py | 280 -------------------------------- wcwidth/text_sizing.py | 155 ++++++++++++++++++ wcwidth/wcwidth.py | 40 ++--- 8 files changed, 498 insertions(+), 621 deletions(-) delete mode 100644 tests/test_osc66.py create mode 100644 tests/test_text_sizing.py delete mode 100644 wcwidth/osc66.py create mode 100644 wcwidth/text_sizing.py diff --git a/docs/specs.rst b/docs/specs.rst index ef4ef31..54d353e 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -162,9 +162,8 @@ sequences while stripping the escape wrapper. declared width. If the sequence straddles a clip boundary, it is replaced with fill characters. -Generation functions :func:`wcwidth.osc66_wrap` and :func:`wcwidth.osc66_scale` -produce correctly formatted OSC 66 sequences. The text payload is limited to -4096 bytes (UTF-8 encoded) per the protocol specification. 
+Sequence generation (emitting OSC 66) is handled by terminal libraries such +as ``blessed``, not by this width-measurement library. See also: `Kitty Text Sizing Protocol`_. diff --git a/tests/test_osc66.py b/tests/test_osc66.py deleted file mode 100644 index 1ba1128..0000000 --- a/tests/test_osc66.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Tests for OSC 66 (Kitty Text Sizing Protocol) support.""" -# 3rd party -import pytest - -# local -import wcwidth -from wcwidth.osc66 import ( - OSC66Metadata, - parse_osc66_metadata, - make_osc66_metadata, - parse_osc66_sequence, - osc66_width, - make_osc66_sequence, - osc66_wrap, - osc66_scale, - _replace_osc66_with_padding, -) - - -PARSE_METADATA_CASES = [ - ('', OSC66Metadata()), - ('s=2', OSC66Metadata(scale=2)), - ('w=3', OSC66Metadata(width=3)), - ('s=2:w=3', OSC66Metadata(scale=2, width=3)), - ('s=2:w=3:n=1:d=2:v=1:h=2', - OSC66Metadata(scale=2, width=3, numerator=1, denominator=2, - vertical_align=1, horizontal_align=2)), - ('n=5:d=10', OSC66Metadata(numerator=5, denominator=10)), - ('v=0:h=0', OSC66Metadata()), - ('s=1:w=0', OSC66Metadata()), -] - - -@pytest.mark.parametrize('raw,expected', PARSE_METADATA_CASES) -def test_parse_osc66_metadata(raw, expected): - assert parse_osc66_metadata(raw) == expected - - -PARSE_METADATA_CLAMP_CASES = [ - ('s=0', OSC66Metadata(scale=1)), - ('s=9', OSC66Metadata(scale=7)), - ('w=8', OSC66Metadata(width=7)), - ('n=20', OSC66Metadata(numerator=15)), - ('d=99', OSC66Metadata(denominator=15)), - ('v=5', OSC66Metadata(vertical_align=2)), - ('h=3', OSC66Metadata(horizontal_align=2)), - ('w=-1', OSC66Metadata(width=0)), -] - - -@pytest.mark.parametrize('raw,expected', PARSE_METADATA_CLAMP_CASES) -def test_parse_osc66_metadata_clamp(raw, expected): - assert parse_osc66_metadata(raw) == expected - - -PARSE_METADATA_EDGE_CASES = [ - ('unknown=5', OSC66Metadata()), - ('s=2:unknown=5:w=3', OSC66Metadata(scale=2, width=3)), - ('s=abc', OSC66Metadata()), - ('s=', OSC66Metadata()), - ('noequalssign', 
OSC66Metadata()), - ('s=2:w=3:', OSC66Metadata(scale=2, width=3)), - (':s=2', OSC66Metadata(scale=2)), -] - - -@pytest.mark.parametrize('raw,expected', PARSE_METADATA_EDGE_CASES) -def test_parse_osc66_metadata_edge(raw, expected): - assert parse_osc66_metadata(raw) == expected - - -MAKE_METADATA_CASES = [ - (OSC66Metadata(), ''), - (OSC66Metadata(scale=2), 's=2'), - (OSC66Metadata(width=3), 'w=3'), - (OSC66Metadata(scale=2, width=3), 's=2:w=3'), - (OSC66Metadata(scale=2, width=3, numerator=1, denominator=2, - vertical_align=1, horizontal_align=2), - 's=2:w=3:n=1:d=2:v=1:h=2'), -] - - -@pytest.mark.parametrize('meta,expected', MAKE_METADATA_CASES) -def test_make_osc66_metadata(meta, expected): - assert make_osc66_metadata(meta) == expected - - -METADATA_ROUNDTRIP_CASES = [ - OSC66Metadata(), - OSC66Metadata(scale=3), - OSC66Metadata(scale=2, width=5), - OSC66Metadata(scale=7, width=7, numerator=15, denominator=15, - vertical_align=2, horizontal_align=2), - OSC66Metadata(numerator=1, denominator=2), -] - - -@pytest.mark.parametrize('meta', METADATA_ROUNDTRIP_CASES) -def test_metadata_roundtrip(meta): - assert parse_osc66_metadata(make_osc66_metadata(meta)) == meta - - -PARSE_SEQUENCE_CASES = [ - ('\x1b]66;s=2;hello\x07', - (OSC66Metadata(scale=2), 'hello', '\x07')), - ('\x1b]66;s=2;hello\x1b\\', - (OSC66Metadata(scale=2), 'hello', '\x1b\\')), - ('\x1b]66;;text\x07', - (OSC66Metadata(), 'text', '\x07')), - ('\x1b]66;s=3:w=2;\x07', - (OSC66Metadata(scale=3, width=2), '', '\x07')), - ('\x1b]66;w=5;AB\x07', - (OSC66Metadata(width=5), 'AB', '\x07')), -] - - -@pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) -def test_parse_osc66_sequence(seq, expected): - assert parse_osc66_sequence(seq) == expected - - -PARSE_SEQUENCE_NONE_CASES = [ - '\x1b[31m', - '\x1b]0;title\x07', - '\x1b]65;s=2;text\x07', - 'plain text', - '', - '\x1b]66;missing_second_semi\x07', -] - - -@pytest.mark.parametrize('seq', PARSE_SEQUENCE_NONE_CASES) -def test_parse_osc66_sequence_none(seq): 
- assert parse_osc66_sequence(seq) is None - - -OSC66_WIDTH_CASES = [ - (OSC66Metadata(scale=2, width=3), 'anything', 6), - (OSC66Metadata(scale=1, width=5), '', 5), - (OSC66Metadata(scale=3, width=1), 'x', 3), - (OSC66Metadata(scale=1, width=0), 'AB', 2), - (OSC66Metadata(scale=2, width=0), 'AB', 4), - (OSC66Metadata(scale=1, width=0), '\u4e2d', 2), - (OSC66Metadata(scale=2, width=0), '\u4e2d', 4), - (OSC66Metadata(scale=1, width=0), '', 0), - (OSC66Metadata(scale=3, width=0), '', 0), -] - - -@pytest.mark.parametrize('meta,inner,expected', OSC66_WIDTH_CASES) -def test_osc66_width(meta, inner, expected): - assert osc66_width(meta, inner) == expected - - -MAKE_SEQUENCE_CASES = [ - ('hi', OSC66Metadata(scale=2, width=1), '\x07', - '\x1b]66;s=2:w=1;hi\x07'), - ('AB', OSC66Metadata(scale=2, width=2), '\x1b\\', - '\x1b]66;s=2:w=2;AB\x1b\\'), - ('x', OSC66Metadata(), '\x07', - '\x1b]66;;x\x07'), - ('', OSC66Metadata(scale=3, width=2), '\x07', - '\x1b]66;s=3:w=2;\x07'), -] - - -@pytest.mark.parametrize('text,meta,term,expected', MAKE_SEQUENCE_CASES) -def test_make_osc66_sequence(text, meta, term, expected): - assert make_osc66_sequence(text, meta, term) == expected - - -def test_make_osc66_sequence_payload_limit(): - text = 'x' * 4097 - with pytest.raises(ValueError, match='4096'): - make_osc66_sequence(text, OSC66Metadata()) - - -WRAP_CASES = [ - (dict(text='AB', scale=2, width=2), - '\x1b]66;s=2:w=2;AB\x07'), - (dict(text='AB', scale=2, width=2, terminator='\x1b\\'), - '\x1b]66;s=2:w=2;AB\x1b\\'), - (dict(text='x', scale=1), - '\x1b]66;;x\x07'), - (dict(text='hi', scale=3, width=1, numerator=1, denominator=2, - vertical_align=1, horizontal_align=2), - '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;hi\x07'), -] - - -@pytest.mark.parametrize('kwargs,expected', WRAP_CASES) -def test_osc66_wrap(kwargs, expected): - assert osc66_wrap(**kwargs) == expected - - -SCALE_CASES = [ - ('AB', 2, '\x1b]66;s=2:w=2;AB\x07'), - ('\u4e2d', 2, '\x1b]66;s=2:w=2;\u4e2d\x07'), - ('x', 3, 
'\x1b]66;s=3:w=1;x\x07'), - ('hello', 1, '\x1b]66;w=5;hello\x07'), -] - - -@pytest.mark.parametrize('text,scale,expected', SCALE_CASES) -def test_osc66_scale(text, scale, expected): - assert osc66_scale(text, scale) == expected - - -def test_osc66_scale_st_terminator(): - result = osc66_scale('AB', 2, terminator='\x1b\\') - assert result == '\x1b]66;s=2:w=2;AB\x1b\\' - - -# --- Integration tests: width() --- - -WIDTH_PARSE_CASES = [ - ('\x1b]66;s=2:w=3;anything\x07', 6), - ('\x1b]66;w=3;x\x07', 3), - ('\x1b]66;s=1:w=0;AB\x07', 2), - ('\x1b]66;s=2:w=0;AB\x07', 4), - ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), - ('\x1b]66;s=1:w=0;\x07', 0), - ('abc\x1b]66;w=3;x\x07def', 9), - ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), - ('\x1b]66;s=2:w=3;text\x1b\\', 6), - ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), -] - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_osc66_parse(text, expected): - assert wcwidth.width(text) == expected - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_osc66_ignore(text, expected): - assert wcwidth.width(text, control_codes='ignore') == expected - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_osc66_strict(text, expected): - assert wcwidth.width(text, control_codes='strict') == expected - - -# --- Integration tests: strip_sequences() --- - -STRIP_OSC66_CASES = [ - ('\x1b]66;s=2;hello\x07', 'hello'), - ('\x1b]66;s=2;hello\x1b\\', 'hello'), - ('\x1b]66;;text\x07', 'text'), - ('\x1b]66;s=3:w=2;\x07', ''), - ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), - ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), - ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), -] - - -@pytest.mark.parametrize('text,expected', STRIP_OSC66_CASES) -def test_strip_sequences_osc66(text, expected): - assert wcwidth.strip_sequences(text) == expected - - -# --- Integration tests: iter_sequences() --- - -def test_iter_sequences_osc66(): - text = 'abc\x1b]66;s=2;hello\x07def' - segments = 
list(wcwidth.iter_sequences(text)) - assert segments == [ - ('abc', False), - ('\x1b]66;s=2;hello\x07', True), - ('def', False), - ] - - -def test_iter_sequences_osc66_st(): - text = '\x1b]66;w=2;AB\x1b\\' - segments = list(wcwidth.iter_sequences(text)) - assert segments == [('\x1b]66;w=2;AB\x1b\\', True)] - - -# --- Integration tests: clip() --- - -CLIP_OSC66_CASES = [ - ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), - ('\x1b]66;w=3;ABC\x07', 0, 2, ' '), - ('\x1b]66;w=3;ABC\x07', 1, 3, ' '), - ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab '), - ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), -] - - -@pytest.mark.parametrize('text,start,end,expected', CLIP_OSC66_CASES) -def test_clip_osc66(text, start, end, expected): - assert wcwidth.clip(text, start, end) == expected - - -# --- Internal helper --- - -REPLACE_PADDING_CASES = [ - ('\x1b]66;w=3;x\x07', ' '), - ('\x1b]66;s=2:w=2;AB\x07', ' '), - ('abc\x1b]66;w=1;x\x07def', 'abc def'), - ('no osc66 here', 'no osc66 here'), -] - - -@pytest.mark.parametrize('text,expected', REPLACE_PADDING_CASES) -def test_replace_osc66_with_padding(text, expected): - assert _replace_osc66_with_padding(text) == expected diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py new file mode 100644 index 0000000..8ba6603 --- /dev/null +++ b/tests/test_text_sizing.py @@ -0,0 +1,316 @@ +"""Tests for Text Sizing Protocol (OSC 66) support.""" +# 3rd party +import pytest + +# local +import wcwidth +from wcwidth.text_sizing import ( + TextSizingParams, + parse_text_sizing_params, + parse_text_sizing, + text_sizing_width, + _replace_text_sizing_with_padding, +) + + +# -- Test-only helpers for generating OSC 66 sequences -- + +_FIELD_TO_KEY = { + 'scale': 's', 'width': 'w', 'numerator': 'n', + 'denominator': 'd', 'vertical_align': 'v', 'horizontal_align': 'h', +} + +_DEFAULTS = TextSizingParams() + + +def _make_params_str(params): + """Serialize TextSizingParams to colon-separated 
key=value string.""" + parts = [] + for field, key in _FIELD_TO_KEY.items(): + val = getattr(params, field) + if val != getattr(_DEFAULTS, field): + parts.append(f'{key}={val}') + return ':'.join(parts) + + +def _make_seq(text, params=None, terminator='\x07', **kwargs): + """Build a complete OSC 66 escape sequence for testing.""" + if params is None: + params = TextSizingParams(**kwargs) + return f'\x1b]66;{_make_params_str(params)};{text}{terminator}' + + +PARSE_PARAMS_CASES = [ + ('', TextSizingParams()), + ('s=2', TextSizingParams(scale=2)), + ('w=3', TextSizingParams(width=3)), + ('s=2:w=3', TextSizingParams(scale=2, width=3)), + ('s=2:w=3:n=1:d=2:v=1:h=2', + TextSizingParams(scale=2, width=3, numerator=1, denominator=2, + vertical_align=1, horizontal_align=2)), + ('n=5:d=10', TextSizingParams(numerator=5, denominator=10)), + ('v=0:h=0', TextSizingParams()), + ('s=1:w=0', TextSizingParams()), +] + + +@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_CASES) +def test_parse_text_sizing_params(raw, expected): + assert parse_text_sizing_params(raw) == expected + + +PARSE_PARAMS_CLAMP_CASES = [ + ('s=0', TextSizingParams(scale=1)), + ('s=9', TextSizingParams(scale=7)), + ('w=8', TextSizingParams(width=7)), + ('n=20', TextSizingParams(numerator=15)), + ('d=99', TextSizingParams(denominator=15)), + ('v=5', TextSizingParams(vertical_align=2)), + ('h=3', TextSizingParams(horizontal_align=2)), + ('w=-1', TextSizingParams(width=0)), +] + + +@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_CLAMP_CASES) +def test_parse_text_sizing_params_clamp(raw, expected): + assert parse_text_sizing_params(raw) == expected + + +PARSE_PARAMS_EDGE_CASES = [ + ('unknown=5', TextSizingParams()), + ('s=2:unknown=5:w=3', TextSizingParams(scale=2, width=3)), + ('s=abc', TextSizingParams()), + ('s=', TextSizingParams()), + ('noequalssign', TextSizingParams()), + ('s=2:w=3:', TextSizingParams(scale=2, width=3)), + (':s=2', TextSizingParams(scale=2)), +] + + 
+@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_EDGE_CASES) +def test_parse_text_sizing_params_edge(raw, expected): + assert parse_text_sizing_params(raw) == expected + + +PARAMS_ROUNDTRIP_CASES = [ + TextSizingParams(), + TextSizingParams(scale=3), + TextSizingParams(scale=2, width=5), + TextSizingParams(scale=7, width=7, numerator=15, denominator=15, + vertical_align=2, horizontal_align=2), + TextSizingParams(numerator=1, denominator=2), +] + + +@pytest.mark.parametrize('params', PARAMS_ROUNDTRIP_CASES) +def test_params_roundtrip(params): + assert parse_text_sizing_params(_make_params_str(params)) == params + + +PARSE_SEQUENCE_CASES = [ + ('\x1b]66;s=2;hello\x07', + (TextSizingParams(scale=2), 'hello', '\x07')), + ('\x1b]66;s=2;hello\x1b\\', + (TextSizingParams(scale=2), 'hello', '\x1b\\')), + ('\x1b]66;;text\x07', + (TextSizingParams(), 'text', '\x07')), + ('\x1b]66;s=3:w=2;\x07', + (TextSizingParams(scale=3, width=2), '', '\x07')), + ('\x1b]66;w=5;AB\x07', + (TextSizingParams(width=5), 'AB', '\x07')), +] + + +@pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) +def test_parse_text_sizing(seq, expected): + assert parse_text_sizing(seq) == expected + + +PARSE_SEQUENCE_NONE_CASES = [ + '\x1b[31m', + '\x1b]0;title\x07', + '\x1b]65;s=2;text\x07', + 'plain text', + '', + '\x1b]66;missing_second_semi\x07', +] + + +@pytest.mark.parametrize('seq', PARSE_SEQUENCE_NONE_CASES) +def test_parse_text_sizing_none(seq): + assert parse_text_sizing(seq) is None + + +TEXT_SIZING_WIDTH_CASES = [ + (TextSizingParams(scale=2, width=3), 'anything', 6), + (TextSizingParams(scale=1, width=5), '', 5), + (TextSizingParams(scale=3, width=1), 'x', 3), + (TextSizingParams(scale=1, width=0), 'AB', 2), + (TextSizingParams(scale=2, width=0), 'AB', 4), + (TextSizingParams(scale=1, width=0), '\u4e2d', 2), + (TextSizingParams(scale=2, width=0), '\u4e2d', 4), + (TextSizingParams(scale=1, width=0), '', 0), + (TextSizingParams(scale=3, width=0), '', 0), +] + + 
+@pytest.mark.parametrize('params,inner,expected', TEXT_SIZING_WIDTH_CASES) +def test_text_sizing_width(params, inner, expected): + assert text_sizing_width(params, inner) == expected + + +MAKE_SEQUENCE_CASES = [ + ('hi', dict(scale=2, width=1), '\x07', + '\x1b]66;s=2:w=1;hi\x07'), + ('AB', dict(scale=2, width=2), '\x1b\\', + '\x1b]66;s=2:w=2;AB\x1b\\'), + ('x', {}, '\x07', + '\x1b]66;;x\x07'), + ('', dict(scale=3, width=2), '\x07', + '\x1b]66;s=3:w=2;\x07'), +] + + +@pytest.mark.parametrize('text,kwargs,term,expected', MAKE_SEQUENCE_CASES) +def test_make_sequence(text, kwargs, term, expected): + assert _make_seq(text, terminator=term, **kwargs) == expected + + +WRAP_CASES = [ + (dict(text='AB', scale=2, width=2), + '\x1b]66;s=2:w=2;AB\x07'), + (dict(text='AB', scale=2, width=2, terminator='\x1b\\'), + '\x1b]66;s=2:w=2;AB\x1b\\'), + (dict(text='x', scale=1), + '\x1b]66;;x\x07'), + (dict(text='hi', scale=3, width=1, numerator=1, denominator=2, + vertical_align=1, horizontal_align=2), + '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;hi\x07'), +] + + +@pytest.mark.parametrize('kwargs,expected', WRAP_CASES) +def test_wrap(kwargs, expected): + text = kwargs.pop('text') + terminator = kwargs.pop('terminator', '\x07') + assert _make_seq(text, terminator=terminator, **kwargs) == expected + + +SCALE_CASES = [ + ('AB', 2, '\x1b]66;s=2:w=2;AB\x07'), + ('\u4e2d', 2, '\x1b]66;s=2:w=2;\u4e2d\x07'), + ('x', 3, '\x1b]66;s=3:w=1;x\x07'), + ('hello', 1, '\x1b]66;w=5;hello\x07'), +] + + +@pytest.mark.parametrize('text,scale,expected', SCALE_CASES) +def test_scale(text, scale, expected): + inner_w = wcwidth.wcswidth(text) + assert _make_seq(text, scale=scale, width=max(0, inner_w)) == expected + + +def test_scale_st_terminator(): + text, scale = 'AB', 2 + inner_w = wcwidth.wcswidth(text) + result = _make_seq(text, scale=scale, width=max(0, inner_w), terminator='\x1b\\') + assert result == '\x1b]66;s=2:w=2;AB\x1b\\' + + +# --- Integration tests: width() --- + +WIDTH_PARSE_CASES = [ + 
('\x1b]66;s=2:w=3;anything\x07', 6), + ('\x1b]66;w=3;x\x07', 3), + ('\x1b]66;s=1:w=0;AB\x07', 2), + ('\x1b]66;s=2:w=0;AB\x07', 4), + ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), + ('\x1b]66;s=1:w=0;\x07', 0), + ('abc\x1b]66;w=3;x\x07def', 9), + ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), + ('\x1b]66;s=2:w=3;text\x1b\\', 6), + ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), +] + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_text_sizing_parse(text, expected): + assert wcwidth.width(text) == expected + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_text_sizing_ignore(text, expected): + assert wcwidth.width(text, control_codes='ignore') == expected + + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) +def test_width_text_sizing_strict(text, expected): + assert wcwidth.width(text, control_codes='strict') == expected + + +# --- Integration tests: strip_sequences() --- + +STRIP_TEXT_SIZING_CASES = [ + ('\x1b]66;s=2;hello\x07', 'hello'), + ('\x1b]66;s=2;hello\x1b\\', 'hello'), + ('\x1b]66;;text\x07', 'text'), + ('\x1b]66;s=3:w=2;\x07', ''), + ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), + ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), + ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), +] + + +@pytest.mark.parametrize('text,expected', STRIP_TEXT_SIZING_CASES) +def test_strip_sequences_text_sizing(text, expected): + assert wcwidth.strip_sequences(text) == expected + + +# --- Integration tests: iter_sequences() --- + +def test_iter_sequences_text_sizing(): + text = 'abc\x1b]66;s=2;hello\x07def' + segments = list(wcwidth.iter_sequences(text)) + assert segments == [ + ('abc', False), + ('\x1b]66;s=2;hello\x07', True), + ('def', False), + ] + + +def test_iter_sequences_text_sizing_st(): + text = '\x1b]66;w=2;AB\x1b\\' + segments = list(wcwidth.iter_sequences(text)) + assert segments == [('\x1b]66;w=2;AB\x1b\\', True)] + + +# --- Integration tests: clip() --- + +CLIP_TEXT_SIZING_CASES = [ + ('\x1b]66;w=3;ABC\x07', 0, 3, 
'\x1b]66;w=3;ABC\x07'), + ('\x1b]66;w=3;ABC\x07', 0, 2, ' '), + ('\x1b]66;w=3;ABC\x07', 1, 3, ' '), + ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab '), + ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), +] + + +@pytest.mark.parametrize('text,start,end,expected', CLIP_TEXT_SIZING_CASES) +def test_clip_text_sizing(text, start, end, expected): + assert wcwidth.clip(text, start, end) == expected + + +# --- Internal helper --- + +REPLACE_PADDING_CASES = [ + ('\x1b]66;w=3;x\x07', ' '), + ('\x1b]66;s=2:w=2;AB\x07', ' '), + ('abc\x1b]66;w=1;x\x07def', 'abc def'), + ('no text sizing here', 'no text sizing here'), +] + + +@pytest.mark.parametrize('text,expected', REPLACE_PADDING_CASES) +def test_replace_text_sizing_with_padding(text, expected): + assert _replace_text_sizing_with_padding(text) == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index c24bbe7..5a2cfc9 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -30,10 +30,8 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr -from .osc66 import (OSC66Metadata, - parse_osc66_sequence, - osc66_wrap, - osc66_scale) +from .text_sizing import (TextSizingParams, + parse_text_sizing) # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
@@ -41,8 +39,7 @@ 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', 'list_versions', 'propagate_sgr', - 'OSC66Metadata', 'parse_osc66_sequence', 'osc66_wrap', - 'osc66_scale') + 'TextSizingParams', 'parse_text_sizing') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 9478694..610f1aa 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -8,10 +8,10 @@ # std imports import re -# OSC 66 (Kitty Text Sizing Protocol) — has positive width, must be checked before ZERO_WIDTH_PATTERN. +# Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. # Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). # https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ -OSC66_PATTERN = re.compile( +TEXT_SIZING_PATTERN = re.compile( r'\x1b\]66;([^;\x07\x1b]*);([^\x07\x1b]*)(\x07|\x1b\\)' ) diff --git a/wcwidth/osc66.py b/wcwidth/osc66.py deleted file mode 100644 index 8d120eb..0000000 --- a/wcwidth/osc66.py +++ /dev/null @@ -1,280 +0,0 @@ -r""" -OSC 66 (Kitty Text Sizing Protocol) parsing and generation. 
- -The `Kitty Text Sizing Protocol`_ allows applications to explicitly tell -terminals how many cells text occupies, using the escape sequence:: - - ESC ] 66 ; metadata ; text BEL/ST - -Metadata is colon-separated ``key=value`` pairs: - -- ``s``: scale (1--7, default 1) -- ``w``: width in cells (0--7, default 0; 0 means auto-calculate from inner text) -- ``n``: fractional numerator (0--15, default 0) -- ``d``: fractional denominator (0--15, default 0) -- ``v``: vertical alignment (0--2, default 0: top, 1: bottom, 2: center) -- ``h``: horizontal alignment (0--2, default 0: left, 1: right, 2: center) - -Width calculation: if ``w > 0``, the sequence occupies ``s * w`` cells. -If ``w == 0``, the sequence occupies ``s * inner_text_width`` cells. - -.. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ - -.. versionadded:: 0.6.0 -""" -from __future__ import annotations - -from typing import NamedTuple - -from .escape_sequences import OSC66_PATTERN - -_MAX_TEXT_PAYLOAD = 4096 - -# Metadata key → (NamedTuple field, min, max, default) -_META_FIELDS = { - 's': ('scale', 1, 7, 1), - 'w': ('width', 0, 7, 0), - 'n': ('numerator', 0, 15, 0), - 'd': ('denominator', 0, 15, 0), - 'v': ('vertical_align', 0, 2, 0), - 'h': ('horizontal_align', 0, 2, 0), -} - -# Reverse map: field name → short key -_FIELD_TO_KEY = {field: key for key, (field, _, _, _) in _META_FIELDS.items()} - - -class OSC66Metadata(NamedTuple): - """Parsed metadata from an OSC 66 escape sequence. - - :param scale: Scale factor (1--7). Text occupies ``scale`` rows tall - and ``scale * width`` columns wide. - :param width: Width in cells (0--7). When 0, width is auto-calculated - from the inner text. - :param numerator: Fractional scaling numerator (0--15). - :param denominator: Fractional scaling denominator (0--15). - :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). - :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). 
- """ - - scale: int = 1 - width: int = 0 - numerator: int = 0 - denominator: int = 0 - vertical_align: int = 0 - horizontal_align: int = 0 - - -def parse_osc66_metadata(raw: str) -> OSC66Metadata: - """Parse colon-separated ``key=value`` metadata string. - - :param raw: Metadata string, e.g. ``'s=2:w=3'``. - :returns: Parsed metadata with values clamped to valid ranges. - Unknown keys are ignored. Non-integer values use defaults. - - Example:: - - >>> parse_osc66_metadata('s=2:w=3') - OSC66Metadata(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - >>> parse_osc66_metadata('') - OSC66Metadata(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - """ - kwargs: dict[str, int] = {} - if not raw: - return OSC66Metadata() - for part in raw.split(':'): - if '=' not in part: - continue - key, _, val_str = part.partition('=') - if key not in _META_FIELDS: - continue - field, lo, hi, default = _META_FIELDS[key] - try: - kwargs[field] = max(lo, min(hi, int(val_str))) - except (ValueError, OverflowError): - kwargs[field] = default - return OSC66Metadata(**kwargs) - - -def make_osc66_metadata(meta: OSC66Metadata) -> str: - """Serialize metadata, omitting fields at their default values. - - :param meta: Metadata to serialize. - :returns: Colon-separated ``key=value`` string. - - Example:: - - >>> make_osc66_metadata(OSC66Metadata(scale=2, width=3)) - 's=2:w=3' - >>> make_osc66_metadata(OSC66Metadata()) - '' - """ - parts = [] - defaults = OSC66Metadata() - for field, key in _FIELD_TO_KEY.items(): - val = getattr(meta, field) - if val != getattr(defaults, field): - parts.append(f'{key}={val}') - return ':'.join(parts) - - -def parse_osc66_sequence(seq: str) -> tuple[OSC66Metadata, str, str] | None: - """Parse a complete OSC 66 escape sequence. - - :param seq: Full escape sequence string. - :returns: Tuple of ``(metadata, inner_text, terminator)`` or ``None`` - if the string is not a valid OSC 66 sequence. 
- - Example:: - - >>> parse_osc66_sequence('\x1b]66;s=2;hello\x07') - (OSC66Metadata(scale=2, ...), 'hello', '\x07') - >>> parse_osc66_sequence('\x1b[31m') is None - True - """ - match = OSC66_PATTERN.fullmatch(seq) - if not match: - return None - return ( - parse_osc66_metadata(match.group(1)), - match.group(2), - match.group(3), - ) - - -def osc66_width( - meta: OSC66Metadata, - inner_text: str, - ambiguous_width: int = 1, -) -> int: - """Calculate the display width of an OSC 66 sequence. - - :param meta: Parsed metadata. - :param inner_text: The text payload of the OSC 66 sequence. - :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Display width in terminal cells. - - When ``meta.width > 0``, returns ``meta.scale * meta.width``. - When ``meta.width == 0``, returns ``meta.scale * measured_inner_width``. - """ - if meta.width > 0: - return meta.scale * meta.width - # Lazy import to avoid circular dependency (wcwidth -> osc66 -> wcwidth) - from .wcwidth import wcswidth # pylint: disable=import-outside-toplevel - inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width) - return meta.scale * max(0, inner_w) - - -def make_osc66_sequence( - text: str, - meta: OSC66Metadata, - terminator: str = '\x07', -) -> str: - r"""Build a complete OSC 66 escape sequence. - - :param text: Text payload. - :param meta: Metadata to encode. - :param terminator: Sequence terminator, ``'\x07'`` (BEL) or - ``'\x1b\\'`` (ST). Default is BEL. - :returns: Complete escape sequence string. - :raises ValueError: If text exceeds 4096 bytes when UTF-8 encoded. 
- - Example:: - - >>> make_osc66_sequence('hi', OSC66Metadata(scale=2, width=1)) - '\x1b]66;s=2:w=1;hi\x07' - """ - if len(text.encode('utf-8')) > _MAX_TEXT_PAYLOAD: - raise ValueError( - f"OSC 66 text payload exceeds {_MAX_TEXT_PAYLOAD} byte limit" - ) - metadata_str = make_osc66_metadata(meta) - return f'\x1b]66;{metadata_str};{text}{terminator}' - - -def osc66_wrap( - text: str, - *, - scale: int = 1, - width: int = 0, - numerator: int = 0, - denominator: int = 0, - vertical_align: int = 0, - horizontal_align: int = 0, - terminator: str = '\x07', -) -> str: - r"""Wrap text in an OSC 66 escape sequence with full control over metadata. - - :param text: Text payload. - :param scale: Scale factor (1--7). - :param width: Width in cells (0--7). 0 means auto-calculate. - :param numerator: Fractional scaling numerator (0--15). - :param denominator: Fractional scaling denominator (0--15). - :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). - :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). - :param terminator: ``'\x07'`` (BEL) or ``'\x1b\\'`` (ST). - :returns: Complete OSC 66 escape sequence. - :raises ValueError: If text exceeds 4096 bytes. - - Example:: - - >>> osc66_wrap('AB', scale=2, width=2) - '\x1b]66;s=2:w=2;AB\x07' - """ - meta = OSC66Metadata( - scale=scale, - width=width, - numerator=numerator, - denominator=denominator, - vertical_align=vertical_align, - horizontal_align=horizontal_align, - ) - return make_osc66_sequence(text, meta, terminator) - - -def osc66_scale( - text: str, - scale: int, - *, - terminator: str = '\x07', - ambiguous_width: int = 1, -) -> str: - r"""Wrap text in an OSC 66 sequence, auto-calculating width from inner text. - - This is the most common use case: scale text to ``scale`` times its - natural width, with the ``w`` parameter set automatically. - - :param text: Text payload. - :param scale: Scale factor (1--7). - :param terminator: ``'\x07'`` (BEL) or ``'\x1b\\'`` (ST). 
- :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Complete OSC 66 escape sequence with auto-calculated ``w``. - :raises ValueError: If text exceeds 4096 bytes. - - Example:: - - >>> osc66_scale('AB', 2) - '\x1b]66;s=2:w=2;AB\x07' - """ - from .wcwidth import wcswidth # pylint: disable=import-outside-toplevel - inner_w = wcswidth(text, ambiguous_width=ambiguous_width) - meta = OSC66Metadata(scale=scale, width=max(0, inner_w)) - return make_osc66_sequence(text, meta, terminator) - - -def _replace_osc66_with_padding( - text: str, - ambiguous_width: int = 1, -) -> str: - """Replace each OSC 66 sequence with spaces matching its declared width. - - Used internally by ``_width_ignored_codes`` to account for OSC 66 - width before stripping other sequences. - """ - def _replacer(match: 're.Match[str]') -> str: - meta = parse_osc66_metadata(match.group(1)) - inner_text = match.group(2) - w = osc66_width(meta, inner_text, ambiguous_width) - return ' ' * w - - return OSC66_PATTERN.sub(_replacer, text) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py new file mode 100644 index 0000000..bdba939 --- /dev/null +++ b/wcwidth/text_sizing.py @@ -0,0 +1,155 @@ +r""" +Text Sizing Protocol (OSC 66) parsing and measurement. + +The `Kitty Text Sizing Protocol`_ allows applications to explicitly tell +terminals how many cells text occupies, using the escape sequence:: + + ESC ] 66 ; metadata ; text BEL/ST + +Metadata is colon-separated ``key=value`` pairs: + +- ``s``: scale (1--7, default 1) +- ``w``: width in cells (0--7, default 0; 0 means auto-calculate from inner text) +- ``n``: fractional numerator (0--15, default 0) +- ``d``: fractional denominator (0--15, default 0) +- ``v``: vertical alignment (0--2, default 0: top, 1: bottom, 2: center) +- ``h``: horizontal alignment (0--2, default 0: left, 1: right, 2: center) + +Width calculation: if ``w > 0``, the sequence occupies ``s * w`` cells. 
+If ``w == 0``, the sequence occupies ``s * inner_text_width`` cells. + +.. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ + +.. versionadded:: 0.6.0 +""" +from __future__ import annotations + +from typing import NamedTuple + +from .escape_sequences import TEXT_SIZING_PATTERN + +# Metadata key → (NamedTuple field, min, max, default) +_META_FIELDS = { + 's': ('scale', 1, 7, 1), + 'w': ('width', 0, 7, 0), + 'n': ('numerator', 0, 15, 0), + 'd': ('denominator', 0, 15, 0), + 'v': ('vertical_align', 0, 2, 0), + 'h': ('horizontal_align', 0, 2, 0), +} + +class TextSizingParams(NamedTuple): + """Parsed parameters from a text sizing escape sequence (OSC 66). + + :param scale: Scale factor (1--7). Text occupies ``scale`` rows tall + and ``scale * width`` columns wide. + :param width: Width in cells (0--7). When 0, width is auto-calculated + from the inner text. + :param numerator: Fractional scaling numerator (0--15). + :param denominator: Fractional scaling denominator (0--15). + :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). + :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). + """ + + scale: int = 1 + width: int = 0 + numerator: int = 0 + denominator: int = 0 + vertical_align: int = 0 + horizontal_align: int = 0 + + +def parse_text_sizing_params(raw: str) -> TextSizingParams: + """Parse colon-separated ``key=value`` metadata string. + + :param raw: Metadata string, e.g. ``'s=2:w=3'``. + :returns: Parsed parameters with values clamped to valid ranges. + Unknown keys are ignored. Non-integer values use defaults. 
+ + Example:: + + >>> parse_text_sizing_params('s=2:w=3') + TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + >>> parse_text_sizing_params('') + TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + """ + kwargs: dict[str, int] = {} + if not raw: + return TextSizingParams() + for part in raw.split(':'): + if '=' not in part: + continue + key, _, val_str = part.partition('=') + if key not in _META_FIELDS: + continue + field, lo, hi, default = _META_FIELDS[key] + try: + kwargs[field] = max(lo, min(hi, int(val_str))) + except (ValueError, OverflowError): + kwargs[field] = default + return TextSizingParams(**kwargs) + + +def parse_text_sizing(seq: str) -> tuple[TextSizingParams, str, str] | None: + """Parse a complete text sizing escape sequence (OSC 66). + + :param seq: Full escape sequence string. + :returns: Tuple of ``(params, inner_text, terminator)`` or ``None`` + if the string is not a valid text sizing sequence. + + Example:: + + >>> parse_text_sizing('\x1b]66;s=2;hello\x07') + (TextSizingParams(scale=2, ...), 'hello', '\x07') + >>> parse_text_sizing('\x1b[31m') is None + True + """ + match = TEXT_SIZING_PATTERN.fullmatch(seq) + if not match: + return None + return ( + parse_text_sizing_params(match.group(1)), + match.group(2), + match.group(3), + ) + + +def text_sizing_width( + params: TextSizingParams, + inner_text: str, + ambiguous_width: int = 1, +) -> int: + """Calculate the display width of a text sizing sequence. + + :param params: Parsed parameters. + :param inner_text: The text payload of the sequence. + :param ambiguous_width: Width for East Asian Ambiguous characters. + :returns: Display width in terminal cells. + + When ``params.width > 0``, returns ``params.scale * params.width``. + When ``params.width == 0``, returns ``params.scale * measured_inner_width``. 
+ """ + if params.width > 0: + return params.scale * params.width + # Lazy import to avoid circular dependency (wcwidth -> text_sizing -> wcwidth) + from .wcwidth import wcswidth # pylint: disable=import-outside-toplevel + inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width) + return params.scale * max(0, inner_w) + + +def _replace_text_sizing_with_padding( + text: str, + ambiguous_width: int = 1, +) -> str: + """Replace each text sizing sequence with spaces matching its declared width. + + Used internally by ``_width_ignored_codes`` to account for text sizing + width before stripping other sequences. + """ + def _replacer(match: 're.Match[str]') -> str: + params = parse_text_sizing_params(match.group(1)) + inner_text = match.group(2) + w = text_sizing_width(params, inner_text, ambiguous_width) + return ' ' * w + + return TEXT_SIZING_PATTERN.sub(_replacer, text) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 6719954..50c4f7d 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -84,13 +84,13 @@ from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import (ZERO_WIDTH_PATTERN, - OSC66_PATTERN, + TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE) -from .osc66 import (parse_osc66_metadata, - osc66_width as _osc66_width, - _replace_osc66_with_padding) +from .text_sizing import (parse_text_sizing_params, + text_sizing_width as _text_sizing_width, + _replace_text_sizing_with_padding) from .unicode_versions import list_versions if TYPE_CHECKING: # pragma: no cover @@ -470,7 +470,7 @@ def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: OSC 66 sequences are replaced with padding of correct width before stripping. 
""" if '\x1b]66;' in text: - text = _replace_osc66_with_padding(text, ambiguous_width) + text = _replace_text_sizing_with_padding(text, ambiguous_width) return wcswidth( strip_sequences(text).translate(_CONTROL_CHAR_TABLE), ambiguous_width=ambiguous_width @@ -586,13 +586,13 @@ def width( if char == '\x1b': # 1a. OSC 66 (text sizing) has positive width — check before zero-width path if text[idx:idx + 5] == '\x1b]66;': - osc66_match = OSC66_PATTERN.match(text, idx) - if osc66_match: - meta = parse_osc66_metadata(osc66_match.group(1)) - current_col += _osc66_width( - meta, osc66_match.group(2), ambiguous_width + ts_match = TEXT_SIZING_PATTERN.match(text, idx) + if ts_match: + meta = parse_text_sizing_params(ts_match.group(1)) + current_col += _text_sizing_width( + meta, ts_match.group(2), ambiguous_width ) - idx = osc66_match.end() + idx = ts_match.end() max_extent = max(max_extent, current_col) continue match = ZERO_WIDTH_PATTERN.match(text, idx) @@ -880,7 +880,7 @@ def strip_sequences(text: str) -> str: 'bold red text' """ if '\x1b]66;' in text: - text = OSC66_PATTERN.sub(r'\2', text) + text = TEXT_SIZING_PATTERN.sub(r'\2', text) return ZERO_WIDTH_PATTERN.sub('', text) @@ -981,17 +981,17 @@ def clip( if char == '\x1b': # OSC 66 (text sizing) has positive width — handle before zero-width path if text[idx:idx + 5] == '\x1b]66;': - osc66_match = OSC66_PATTERN.match(text, idx) - if osc66_match: - meta = parse_osc66_metadata(osc66_match.group(1)) - w = _osc66_width( - meta, osc66_match.group(2), ambiguous_width + ts_match = TEXT_SIZING_PATTERN.match(text, idx) + if ts_match: + meta = parse_text_sizing_params(ts_match.group(1)) + w = _text_sizing_width( + meta, ts_match.group(2), ambiguous_width ) if w == 0: if start <= col < end: - output.append(osc66_match.group()) + output.append(ts_match.group()) elif col >= start and col + w <= end: - output.append(osc66_match.group()) + output.append(ts_match.group()) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start 
= sgr col += w @@ -1003,7 +1003,7 @@ def clip( col += w else: col += w - idx = osc66_match.end() + idx = ts_match.end() continue if (match := ZERO_WIDTH_PATTERN.match(text, idx)): From 7d08ea6bcfda241f5c0e8b688f195e3257396176 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 10 Apr 2026 22:14:19 -0400 Subject: [PATCH 03/70] don't do all that --- docs/specs.rst | 60 +++++++++----------------------------------------- 1 file changed, 10 insertions(+), 50 deletions(-) diff --git a/docs/specs.rst b/docs/specs.rst index 54d353e..e5f710c 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -8,7 +8,16 @@ This document defines how this Python wcwidth library measures the printable wid a string. This is not meant to an official standard, but as a terse description of the lowest level API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`. -The :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode Standard Annex #29`_. +The higher level functions :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode +Standard Annex #29`_. It is designed that :func:`wcwidth.wcswidth` should be used with each result +of smallest atomic "unit" of text yielded by :func:`wcwidth.iter_graphemes`. + +The highest level :func:`wcwidth.width` is Terminal-aware, and no specific specification is +declared or referenced. The default arguments ``control_codes='parse'``, ``tabsize=8``, and +``ambiguous_width=1`` are described only by their docstrings, or specification of related control +codes parsed, such as `Kitty Text Sizing Protocol`_. + +This specification applies only to :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`. Width of -1 ----------- @@ -119,56 +128,7 @@ formation: the font engine merges the consonants into a single ligature glyph. See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". 
-OSC 66 (Kitty Text Sizing Protocol) ------------------------------------- - -The `Kitty Text Sizing Protocol`_ (OSC 66) allows applications to explicitly -declare how many terminal cells text occupies, using the escape sequence:: - - ESC ] 66 ; metadata ; text BEL/ST - -Where ``metadata`` is colon-separated ``key=value`` pairs and the terminator -is either BEL (``0x07``) or ST (``ESC \``). - -Metadata parameters: - -- ``s``: Scale factor (1--7, default 1). Text occupies ``s`` rows tall and - ``s * w`` columns wide. -- ``w``: Width in cells (0--7, default 0). When 0, width is auto-calculated - from the inner text. -- ``n``: Fractional scaling numerator (0--15, default 0). -- ``d``: Fractional scaling denominator (0--15, default 0). -- ``v``: Vertical alignment (0=top, 1=bottom, 2=center; default 0). -- ``h``: Horizontal alignment (0=left, 1=right, 2=center; default 0). - -Width calculation by :func:`wcwidth.width`: - -- When ``w > 0``: the sequence occupies exactly ``s * w`` cells, regardless - of the inner text content. -- When ``w == 0``: the sequence occupies ``s * inner_text_width`` cells, where - ``inner_text_width`` is the measured width of the text payload. - -The fractional scaling parameters (``n`` and ``d``) adjust the rendered font -size within the allocated cells but do not change the cell count. - -OSC 66 sequences are handled in all ``control_codes`` modes (``'parse'``, -``'strict'``, ``'ignore'``), since they declare explicit width rather than -causing indeterminate cursor movement. - -:func:`wcwidth.strip_sequences` extracts the inner text payload from OSC 66 -sequences while stripping the escape wrapper. - -:func:`wcwidth.clip` treats each OSC 66 sequence as an atomic unit of its -declared width. If the sequence straddles a clip boundary, it is replaced -with fill characters. - -Sequence generation (emitting OSC 66) is handled by terminal libraries such -as ``blessed``, not by this width-measurement library. 
- -See also: `Kitty Text Sizing Protocol`_. - .. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ - .. _`U+0000`: https://codepoints.net/U+0000 .. _`U+0001`: https://codepoints.net/U+0001 .. _`U+001F`: https://codepoints.net/U+001F From a33b40188e61011083e1c5c4ddd740c7629fbec4 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Sat, 11 Apr 2026 13:31:11 -0400 Subject: [PATCH 04/70] readme example --- docs/intro.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/intro.rst b/docs/intro.rst index fec7bab..0f1e762 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -133,6 +133,9 @@ Use function `width()`_ to measure a string with improved handling of ``control_ >>> # as well as sequences with "indeterminate" effects like Home + Clear >>> wcwidth.width('\x1b[H\x1b[2J') 0 + >>> # Kitty text sizing protocol (OSC 66): 2x-scaled "Hello" occupies 10 cells + >>> wcwidth.width('\x1b]66;s=2;Hello\x07') + 10 >>> # or, raise ValueError for "indeterminate" effects using control_codes='strict' >>> wcwidth.width('\n', control_codes='strict') Traceback (most recent call last): From 8db11b2a6f87ec1c593912c5492f9da249e72471 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 14 Apr 2026 19:54:55 -0400 Subject: [PATCH 05/70] Briefly mention kitty sizing --- wcwidth/wcwidth.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 50c4f7d..d9e33d9 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -493,9 +493,10 @@ def width( :param text: String to measure. :param control_codes: How to handle control characters and sequences: - - ``'parse'`` (default): Track horizontal cursor movement from BS ``\b``, CR ``\r``, TAB - ``\t``, and cursor left and right movement sequences. Vertical movement (LF, VT, FF) and - indeterminate sequences are zero-width. Never raises. 
+ - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB + ``\t``, cursor left and right movement sequences. Vertical movement (LF, VT, FF) and + indeterminate terminal sequences are zero-width. OSC 66 Kitty Text Sizing protocol, OSC 8 + Hyperlink, and many other kinds of output sequences are parsed for displayed measurements. - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with indeterminate results of the screen or cursor, like clear or vertical movement. Generally, these should be handled with a virtual terminal emulator (like 'pyte'). From 39f3b3381681f1755d07b8c89b4e2e9620aecb57 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Sun, 19 Apr 2026 11:37:13 -0400 Subject: [PATCH 06/70] broken! not complete! --- wcwidth/text_sizing.py | 42 +++++++++++++++++++++--------------------- wcwidth/wcwidth.py | 21 +++++++++++++-------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index bdba939..d870c84 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -1,24 +1,26 @@ r""" -Text Sizing Protocol (OSC 66) parsing and measurement. +`Kitty Text Sizing Protocol`_ (OSC 66) parsing and measurement. 
-The `Kitty Text Sizing Protocol`_ allows applications to explicitly tell +The `Kitty Text Sizing Protocol`_ allows terminal apps to explicitly tell terminals how many cells text occupies, using the escape sequence:: ESC ] 66 ; metadata ; text BEL/ST Metadata is colon-separated ``key=value`` pairs: -- ``s``: scale (1--7, default 1) -- ``w``: width in cells (0--7, default 0; 0 means auto-calculate from inner text) -- ``n``: fractional numerator (0--15, default 0) -- ``d``: fractional denominator (0--15, default 0) -- ``v``: vertical alignment (0--2, default 0: top, 1: bottom, 2: center) -- ``h``: horizontal alignment (0--2, default 0: left, 1: right, 2: center) +- ``s``: scale +- ``w``: width in cells +- ``n``: fractional numerator +- ``d``: fractional denominator +- ``v``: vertical alignment +- ``h``: horizontal alignment -Width calculation: if ``w > 0``, the sequence occupies ``s * w`` cells. -If ``w == 0``, the sequence occupies ``s * inner_text_width`` cells. +Parsing is pretty straight-forward: -.. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +- When ``w > 0``, return ``s * w``. +- Otherwise ``w == 0``, ``s * wcswidth(inner_text_width)`` cells. + +.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ .. 
versionadded:: 0.6.0 """ @@ -26,9 +28,9 @@ from typing import NamedTuple -from .escape_sequences import TEXT_SIZING_PATTERN +from .escape_sequences import text_sizing_pattern -# Metadata key → (NamedTuple field, min, max, default) +# Metadata key (field, min, max, default) _META_FIELDS = { 's': ('scale', 1, 7, 1), 'w': ('width', 0, 7, 0), @@ -83,10 +85,7 @@ def parse_text_sizing_params(raw: str) -> TextSizingParams: if key not in _META_FIELDS: continue field, lo, hi, default = _META_FIELDS[key] - try: - kwargs[field] = max(lo, min(hi, int(val_str))) - except (ValueError, OverflowError): - kwargs[field] = default + kwargs[field] = max(lo, min(hi, int(val_str))) return TextSizingParams(**kwargs) @@ -101,8 +100,8 @@ def parse_text_sizing(seq: str) -> tuple[TextSizingParams, str, str] | None: >>> parse_text_sizing('\x1b]66;s=2;hello\x07') (TextSizingParams(scale=2, ...), 'hello', '\x07') - >>> parse_text_sizing('\x1b[31m') is None - True + >>> parse_text_sizing('\x1b[31m') + None """ match = TEXT_SIZING_PATTERN.fullmatch(seq) if not match: @@ -131,8 +130,9 @@ def text_sizing_width( """ if params.width > 0: return params.scale * params.width - # Lazy import to avoid circular dependency (wcwidth -> text_sizing -> wcwidth) - from .wcwidth import wcswidth # pylint: disable=import-outside-toplevel + # Lazy import to avoid circular dependency + # pylint: disable=import-outside-toplevel + from .wcwidth import wcswidth inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width) return params.scale * max(0, inner_w) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index d9e33d9..7e3f61a 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -467,10 +467,7 @@ def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: Fast path for width() with control_codes='ignore'. Strips escape sequences and control characters, then measures remaining text. - OSC 66 sequences are replaced with padding of correct width before stripping. 
""" - if '\x1b]66;' in text: - text = _replace_text_sizing_with_padding(text, ambiguous_width) return wcswidth( strip_sequences(text).translate(_CONTROL_CHAR_TABLE), ambiguous_width=ambiguous_width @@ -502,8 +499,8 @@ def width( these should be handled with a virtual terminal emulator (like 'pyte'). - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as width 0. This is the fastest measurement for text already filtered or known not to contain - any kinds of control codes or sequences. TAB ``\t`` is zero-width; for tab expansion, - pre-process: ``text.replace('\t', ' ' * 8)``. + any kinds of control codes or sequences. TAB ``\t`` is zero-width; to ensure + tab expansion, pre-process text using :func:`str.expandtabs`. :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. Must be positive. Has no effect when ``control_codes='ignore'``. @@ -548,7 +545,7 @@ def width( return len(text) # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. - # Only check for longer strings - the detection overhead hurts short string performance. + # Only check longer strings - the detection overhead hurts short string performance. if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: # Check for cursor-affecting control characters if '\b' not in text and '\t' not in text and '\r' not in text: @@ -559,7 +556,7 @@ def width( ): control_codes = 'ignore' - # Fast path for ignore mode -- this is useful if you know the text is already "clean" + # Fast path for ignore mode, useful if you know the text is already free of control codes if control_codes == 'ignore': return _width_ignored_codes(text, ambiguous_width) @@ -585,7 +582,7 @@ def width( # 1. Handle ESC sequences if char == '\x1b': - # 1a. OSC 66 (text sizing) has positive width — check before zero-width path + # 1a. 
OSC 66 (kitty text sizing) positive width if text[idx:idx + 5] == '\x1b]66;': ts_match = TEXT_SIZING_PATTERN.match(text, idx) if ts_match: @@ -596,6 +593,7 @@ def width( idx = ts_match.end() max_extent = max(max_extent, current_col) continue + # 1b. Check all other "zero-width" terminal sequences match = ZERO_WIDTH_PATTERN.match(text, idx) if match: seq = match.group() @@ -864,6 +862,9 @@ def strip_sequences(text: str) -> str: r""" Return text with all terminal escape sequences removed. + For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), + the inner text is preserved. + Unknown or incomplete ESC sequences are preserved. :param text: String that may contain terminal escape sequences. @@ -879,6 +880,10 @@ def strip_sequences(text: str) -> str: 'hello' >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') 'bold red text' + >>> strip_sequences('\x1b]66;s=2;hello\x07') + 'hello' + >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') + '[view]' """ if '\x1b]66;' in text: text = TEXT_SIZING_PATTERN.sub(r'\2', text) From 43c298ed401c0e6e1582881f17f4868702c2f044 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 23 Apr 2026 21:44:35 -0400 Subject: [PATCH 07/70] still pecking away at this .. --- docs/specs.rst | 26 ++++++++----- tests/test_text_sizing.py | 49 +++++++++++++++++------- wcwidth/__init__.py | 3 +- wcwidth/text_sizing.py | 79 +++++++++++++++++++++++++++------------ wcwidth/wcwidth.py | 10 ++--- 5 files changed, 113 insertions(+), 54 deletions(-) diff --git a/docs/specs.rst b/docs/specs.rst index e5f710c..6a144a4 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -6,18 +6,23 @@ Specification This document defines how this Python wcwidth library measures the printable width of characters of a string. This is not meant to an official standard, but as a terse description of the lowest level -API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`. 
+API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth` and its relation to higher level +API function :func:`wcwidth.wcwidth`. -The higher level functions :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode -Standard Annex #29`_. It is designed that :func:`wcwidth.wcswidth` should be used with each result -of smallest atomic "unit" of text yielded by :func:`wcwidth.iter_graphemes`. +Scope +----- -The highest level :func:`wcwidth.width` is Terminal-aware, and no specific specification is -declared or referenced. The default arguments ``control_codes='parse'``, ``tabsize=8``, and -``ambiguous_width=1`` are described only by their docstrings, or specification of related control -codes parsed, such as `Kitty Text Sizing Protocol`_. +The lowest level functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth` return -1 when any +control codes are present. The higher level function :func:`wcwidth.width` never returns -1, +accepting default arguments, ``control_codes='parse'`` and its behavior and options are described by +its docstring and specifications of related control codes, `XTerm Control Sequences`_ and `Kitty +Text Sizing Protocol`_. -This specification applies only to :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`. +:func:`wcwidth.iter_graphemes` is specified by `Unicode Standard Annex #29`_ and each string yielded +by :func:`wcwidth.iter_graphemes` may be mapped to :func:`wcwidth.wcswidth`. Although it matches +behavior of Python 3.15 `uncodedata.iter_graphemes()`_ it differs in its return value, +:func:`wcwidth.iter_graphemes` yields only strings, while :func:`wcwidth.iter_graphemes` yields +``unicodedata.Segment`` class objects. Width of -1 ----------- @@ -128,7 +133,9 @@ formation: the font engine merges the consonants into a single ligature glyph. See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". +.. 
_`Hyperlinks in Terminal Emulators`: https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda .. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +.. _`XTerm Control Sequences`: https://invisible-island.net/xterm/ctlseqs/ctlseqs.html .. _`U+0000`: https://codepoints.net/U+0000 .. _`U+0001`: https://codepoints.net/U+0001 .. _`U+001F`: https://codepoints.net/U+001F @@ -174,3 +181,4 @@ See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". .. _`aksara`: https://www.unicode.org/glossary/#aksara .. _`L2/2023/23107`: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/ +.. _`uncodedata.iter_graphemes()`: https://docs.python.org/3.15/library/unicodedata.html#unicodedata.iter_graphemes diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index 8ba6603..d479159 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -4,14 +4,12 @@ # local import wcwidth -from wcwidth.text_sizing import ( - TextSizingParams, - parse_text_sizing_params, - parse_text_sizing, - text_sizing_width, - _replace_text_sizing_with_padding, -) - +from wcwidth.text_sizing import (TextSizingParams, + parse_text_sizing, + text_sizing_width, + parse_text_sizing_params, + _replace_text_sizing_with_padding, + _META_FIELDS) # -- Test-only helpers for generating OSC 66 sequences -- @@ -79,18 +77,20 @@ def test_parse_text_sizing_params_clamp(raw, expected): PARSE_PARAMS_EDGE_CASES = [ ('unknown=5', TextSizingParams()), ('s=2:unknown=5:w=3', TextSizingParams(scale=2, width=3)), - ('s=abc', TextSizingParams()), - ('s=', TextSizingParams()), ('noequalssign', TextSizingParams()), ('s=2:w=3:', TextSizingParams(scale=2, width=3)), (':s=2', TextSizingParams(scale=2)), ] - @pytest.mark.parametrize('raw,expected', PARSE_PARAMS_EDGE_CASES) def test_parse_text_sizing_params_edge(raw, expected): assert parse_text_sizing_params(raw) == expected +# 
TODO: assert ValueError for all such values, when control_codes='parse', ignore, +# when control_codes='strict', raise decorated ValueError, 's', 'w', etc. +# ('s=', TextSizingParams()), +# ('s=abc', TextSizingParams()), + PARAMS_ROUNDTRIP_CASES = [ TextSizingParams(), @@ -110,6 +110,10 @@ def test_params_roundtrip(params): PARSE_SEQUENCE_CASES = [ ('\x1b]66;s=2;hello\x07', (TextSizingParams(scale=2), 'hello', '\x07')), + ('\x1b]66;s=99;hello\x07', + (TextSizingParams(scale=_META_FIELDS['s'].high), 'hello', '\x07')), + ('\x1b]66;s=-99;hello\x07', + (TextSizingParams(scale=_META_FIELDS['s'].low), 'hello', '\x07')), ('\x1b]66;s=2;hello\x1b\\', (TextSizingParams(scale=2), 'hello', '\x1b\\')), ('\x1b]66;;text\x07', @@ -118,6 +122,8 @@ def test_params_roundtrip(params): (TextSizingParams(scale=3, width=2), '', '\x07')), ('\x1b]66;w=5;AB\x07', (TextSizingParams(width=5), 'AB', '\x07')), + ('\x1b]66;s=7;' + ('X' * 30) + '\x07', + (TextSizingParams(scale=7), 'X' * 30, '\x07')), ] @@ -133,6 +139,7 @@ def test_parse_text_sizing(seq, expected): 'plain text', '', '\x1b]66;missing_second_semi\x07', + ] @@ -224,7 +231,7 @@ def test_scale_st_terminator(): ('\x1b]66;w=3;x\x07', 3), ('\x1b]66;s=1:w=0;AB\x07', 2), ('\x1b]66;s=2:w=0;AB\x07', 4), - ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), + ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), # '中' ('\x1b]66;s=1:w=0;\x07', 0), ('abc\x1b]66;w=3;x\x07def', 9), ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), @@ -238,8 +245,22 @@ def test_width_text_sizing_parse(text, expected): assert wcwidth.width(text) == expected -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_text_sizing_ignore(text, expected): +WIDTH_PARSE_IGNORED_CASES = [ + # when ignored, only the 'inner text' width is measured + ('\x1b]66;s=2:w=3;anything\x07', 8), + ('\x1b]66;w=3;x\x07', 1), + ('\x1b]66;s=1:w=0;AB\x07', 2), + ('\x1b]66;s=2:w=0;AB\x07', 2), + ('\x1b]66;s=2:w=0;\u4e2d\x07', 2), # '中' + ('\x1b]66;s=1:w=0;\x07', 0), + ('abc\x1b]66;w=3;x\x07def', 7), + 
('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 2), + ('\x1b]66;s=2:w=3;text\x1b\\', 4), + ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), +] + +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_IGNORED_CASES) +def test_width_text_sizing_ignored(text, expected): assert wcwidth.width(text, control_codes='ignore') == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 5a2cfc9..7cd68e6 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -30,8 +30,7 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr -from .text_sizing import (TextSizingParams, - parse_text_sizing) +from .text_sizing import TextSizingParams, parse_text_sizing # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index d870c84..1c9d310 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -26,29 +26,41 @@ """ from __future__ import annotations +import typing from typing import NamedTuple -from .escape_sequences import text_sizing_pattern - -# Metadata key (field, min, max, default) -_META_FIELDS = { - 's': ('scale', 1, 7, 1), - 'w': ('width', 0, 7, 0), - 'n': ('numerator', 0, 15, 0), - 'd': ('denominator', 0, 15, 0), - 'v': ('vertical_align', 0, 2, 0), - 'h': ('horizontal_align', 0, 2, 0), +# local +from .escape_sequences import TEXT_SIZING_PATTERN + +if typing.TYPE_CHECKING: # pragma: no cover + # std imports + import re + +class _MetaField(NamedTuple): + name: str + low: int + high: int + +_META_FIELDS: dict[str, MetaField] = { + 's': _MetaField('scale', low=1, high=7), + 'w': _MetaField('width', low=0, high=7), + 'n': _MetaField('numerator', low=0, high=15), + 'd': _MetaField('denominator', low=0, high=15), + 'v': _MetaField('vertical_align', low=0, high=2), + 'h': _MetaField('horizontal_align', low=0, high=2), } + class 
TextSizingParams(NamedTuple): - """Parsed parameters from a text sizing escape sequence (OSC 66). + """ + Parsed parameters from a text sizing escape sequence (OSC 66). - :param scale: Scale factor (1--7). Text occupies ``scale`` rows tall + :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall and ``scale * width`` columns wide. - :param width: Width in cells (0--7). When 0, width is auto-calculated + :param width: Width in cells (0-7). When 0, width is auto-calculated from the inner text. - :param numerator: Fractional scaling numerator (0--15). - :param denominator: Fractional scaling denominator (0--15). + :param numerator: Fractional scaling numerator (0-15). + :param denominator: Fractional scaling denominator (0-15). :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). """ @@ -61,10 +73,14 @@ class TextSizingParams(NamedTuple): horizontal_align: int = 0 -def parse_text_sizing_params(raw: str) -> TextSizingParams: - """Parse colon-separated ``key=value`` metadata string. +def parse_text_sizing_params(raw: str, control_codes='parse') -> TextSizingParams: + """ + Parse colon-separated ``key=value`` metadata string. :param raw: Metadata string, e.g. ``'s=2:w=3'``. + :param control_does: 'parse' or 'strict'. + :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise + ValueError. :returns: Parsed parameters with values clamped to valid ranges. Unknown keys are ignored. Non-integer values use defaults. 
@@ -81,16 +97,28 @@ def parse_text_sizing_params(raw: str) -> TextSizingParams: for part in raw.split(':'): if '=' not in part: continue - key, _, val_str = part.partition('=') - if key not in _META_FIELDS: + key, _eq, val = part.partition('=') + field = _META_FIELDS.get(key) + if field is None: + if control_codes == 'strict': + raise ValueError(f"Unknown text sizing field '{key}' in OSC 66 sequence, {raw!r}") + # ignore unknown fields unless 'strict' continue - field, lo, hi, default = _META_FIELDS[key] - kwargs[field] = max(lo, min(hi, int(val_str))) + try: + value = int(val) + except ValueError as exc: + if control_does == 'strict': + raise ValueError(f"Illegal text sizing value '{val}' " + f"in OSC 66 sequence, {raw!r}: {exc}") + # ignore value, using default, unless 'strict' + continue + kwargs[field.name] = max(field.low, min(field.high, value)) return TextSizingParams(**kwargs) def parse_text_sizing(seq: str) -> tuple[TextSizingParams, str, str] | None: - """Parse a complete text sizing escape sequence (OSC 66). + r""" + Parse a complete text sizing escape sequence (OSC 66). :param seq: Full escape sequence string. :returns: Tuple of ``(params, inner_text, terminator)`` or ``None`` @@ -118,7 +146,8 @@ def text_sizing_width( inner_text: str, ambiguous_width: int = 1, ) -> int: - """Calculate the display width of a text sizing sequence. + """ + Calculate the display width of a text sizing sequence. :param params: Parsed parameters. :param inner_text: The text payload of the sequence. 
@@ -132,6 +161,7 @@ def text_sizing_width( return params.scale * params.width # Lazy import to avoid circular dependency # pylint: disable=import-outside-toplevel + # local from .wcwidth import wcswidth inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width) return params.scale * max(0, inner_w) @@ -141,7 +171,8 @@ def _replace_text_sizing_with_padding( text: str, ambiguous_width: int = 1, ) -> str: - """Replace each text sizing sequence with spaces matching its declared width. + """ + Replace each text sizing sequence with spaces matching its declared width. Used internally by ``_width_ignored_codes`` to account for text sizing width before stripping other sequences. diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 7e3f61a..d6b915a 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -80,6 +80,8 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH +from .text_sizing import text_sizing_width as _text_sizing_width +from .text_sizing import parse_text_sizing_params, _replace_text_sizing_with_padding from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR from .table_ambiguous import AMBIGUOUS_EASTASIAN @@ -88,9 +90,6 @@ CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE) -from .text_sizing import (parse_text_sizing_params, - text_sizing_width as _text_sizing_width, - _replace_text_sizing_with_padding) from .unicode_versions import list_versions if TYPE_CHECKING: # pragma: no cover @@ -549,10 +548,11 @@ def width( if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: # Check for cursor-affecting control characters if '\b' not in text and '\t' not in text and '\r' not in text: - # Check for escape sequences - if none, or only non-cursor-movement sequences + # Check for escape sequences that can't be ignored, if 
present if '\x1b' not in text or ( not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) + not CURSOR_LEFT_SEQUENCE.search(text) and + not TEXT_SIZING_PATTERN.search(text) ): control_codes = 'ignore' From bea1c34ff6d48da53081e9160b5324fd96cc2a62 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Mon, 27 Apr 2026 15:11:18 -0400 Subject: [PATCH 08/70] passing tests, now we lint .. --- tests/test_text_sizing.py | 444 +++++++++++++++++--------------------- wcwidth/__init__.py | 5 +- wcwidth/text_sizing.py | 249 +++++++++++---------- wcwidth/textwrap.py | 6 +- wcwidth/wcwidth.py | 72 +++---- 5 files changed, 359 insertions(+), 417 deletions(-) diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index d479159..eb5cbda 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -4,227 +4,112 @@ # local import wcwidth -from wcwidth.text_sizing import (TextSizingParams, - parse_text_sizing, - text_sizing_width, - parse_text_sizing_params, - _replace_text_sizing_with_padding, - _META_FIELDS) - -# -- Test-only helpers for generating OSC 66 sequences -- - -_FIELD_TO_KEY = { - 'scale': 's', 'width': 'w', 'numerator': 'n', - 'denominator': 'd', 'vertical_align': 'v', 'horizontal_align': 'h', -} - -_DEFAULTS = TextSizingParams() - - -def _make_params_str(params): - """Serialize TextSizingParams to colon-separated key=value string.""" - parts = [] - for field, key in _FIELD_TO_KEY.items(): - val = getattr(params, field) - if val != getattr(_DEFAULTS, field): - parts.append(f'{key}={val}') - return ':'.join(parts) - - -def _make_seq(text, params=None, terminator='\x07', **kwargs): - """Build a complete OSC 66 escape sequence for testing.""" - if params is None: - params = TextSizingParams(**kwargs) - return f'\x1b]66;{_make_params_str(params)};{text}{terminator}' - - -PARSE_PARAMS_CASES = [ - ('', TextSizingParams()), - ('s=2', TextSizingParams(scale=2)), - ('w=3', TextSizingParams(width=3)), - ('s=2:w=3', 
TextSizingParams(scale=2, width=3)), - ('s=2:w=3:n=1:d=2:v=1:h=2', - TextSizingParams(scale=2, width=3, numerator=1, denominator=2, - vertical_align=1, horizontal_align=2)), - ('n=5:d=10', TextSizingParams(numerator=5, denominator=10)), - ('v=0:h=0', TextSizingParams()), - ('s=1:w=0', TextSizingParams()), +from wcwidth.text_sizing import TextSizing, TextSizingParams +from wcwidth.escape_sequences import TEXT_SIZING_PATTERN + +CONTROL_CODES_PARAMS_CASES = [ + ('x=2', "", "Unknown text sizing field 'x' in "), + ('s=3:x=3', "s=3", "Unknown text sizing field 'x' in "), + ('s=2:x=3:w=9', "s=2:w=7", "Unknown text sizing field 'x' in "), + ('xyz=2', "", "Unknown text sizing field 'xyz' in "), + ('xxx', "", "Expected '=' in text sizing parameter"), + ('s=xxx', "", "Illegal text sizing value 'xxx' in "), + ('s=-99', "", "Out of bounds text sizing value '-99' in "), + ('s=99', "s=7", "Out of bounds text sizing value '99' in "), + ('w=-1', "", "Out of bounds text sizing value '-1' in "), + ('w=8', "w=7", "Out of bounds text sizing value '8' in "), + ('n=20', "n=15", "Out of bounds text sizing value '20' in "), + ('d=99', "d=15", "Out of bounds text sizing value '99' in "), + ('v=5', "v=2", "Out of bounds text sizing value '5' in "), + ('h=3', "h=2", "Out of bounds text sizing value '3' in "), ] -@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_CASES) -def test_parse_text_sizing_params(raw, expected): - assert parse_text_sizing_params(raw) == expected +@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) +def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): + """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" + # assert control_codes='strict' raises expected exception, + with pytest.raises(ValueError) as exc_info: + TextSizingParams.from_params(given_params, control_codes='strict') + assert exc_info.value.args[0].startswith(expected_exc) + # when 
'parse' (default), any illegal argument or value is filtered, excluded, or clipped + params = TextSizingParams.from_params(given_params) + assert params.make_sequence() == expected_remainder -PARSE_PARAMS_CLAMP_CASES = [ - ('s=0', TextSizingParams(scale=1)), - ('s=9', TextSizingParams(scale=7)), - ('w=8', TextSizingParams(width=7)), - ('n=20', TextSizingParams(numerator=15)), - ('d=99', TextSizingParams(denominator=15)), - ('v=5', TextSizingParams(vertical_align=2)), - ('h=3', TextSizingParams(horizontal_align=2)), - ('w=-1', TextSizingParams(width=0)), -] - - -@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_CLAMP_CASES) -def test_parse_text_sizing_params_clamp(raw, expected): - assert parse_text_sizing_params(raw) == expected - - -PARSE_PARAMS_EDGE_CASES = [ - ('unknown=5', TextSizingParams()), - ('s=2:unknown=5:w=3', TextSizingParams(scale=2, width=3)), - ('noequalssign', TextSizingParams()), - ('s=2:w=3:', TextSizingParams(scale=2, width=3)), - (':s=2', TextSizingParams(scale=2)), -] - -@pytest.mark.parametrize('raw,expected', PARSE_PARAMS_EDGE_CASES) -def test_parse_text_sizing_params_edge(raw, expected): - assert parse_text_sizing_params(raw) == expected - -# TODO: assert ValueError for all such values, when control_codes='parse', ignore, -# when control_codes='strict', raise decorated ValueError, 's', 'w', etc. 
-# ('s=', TextSizingParams()), -# ('s=abc', TextSizingParams()), - - -PARAMS_ROUNDTRIP_CASES = [ - TextSizingParams(), - TextSizingParams(scale=3), - TextSizingParams(scale=2, width=5), - TextSizingParams(scale=7, width=7, numerator=15, denominator=15, - vertical_align=2, horizontal_align=2), - TextSizingParams(numerator=1, denominator=2), -] - - -@pytest.mark.parametrize('params', PARAMS_ROUNDTRIP_CASES) -def test_params_roundtrip(params): - assert parse_text_sizing_params(_make_params_str(params)) == params - - -PARSE_SEQUENCE_CASES = [ - ('\x1b]66;s=2;hello\x07', - (TextSizingParams(scale=2), 'hello', '\x07')), - ('\x1b]66;s=99;hello\x07', - (TextSizingParams(scale=_META_FIELDS['s'].high), 'hello', '\x07')), - ('\x1b]66;s=-99;hello\x07', - (TextSizingParams(scale=_META_FIELDS['s'].low), 'hello', '\x07')), - ('\x1b]66;s=2;hello\x1b\\', - (TextSizingParams(scale=2), 'hello', '\x1b\\')), - ('\x1b]66;;text\x07', - (TextSizingParams(), 'text', '\x07')), - ('\x1b]66;s=3:w=2;\x07', - (TextSizingParams(scale=3, width=2), '', '\x07')), - ('\x1b]66;w=5;AB\x07', - (TextSizingParams(width=5), 'AB', '\x07')), - ('\x1b]66;s=7;' + ('X' * 30) + '\x07', - (TextSizingParams(scale=7), 'X' * 30, '\x07')), -] - - -@pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) -def test_parse_text_sizing(seq, expected): - assert parse_text_sizing(seq) == expected +@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) +def test_text_sizing_width_control_codes(given_params, expected_remainder, expected_exc): + """Verify control_codes='strict' with invalid OSC 66 sequences in wciwdth.width().""" + seq1 = '\x1b]66;' + given_params + ';ABC' + '\x07' + seq2 = '\x1b]66;' + given_params + ';ABC' + '\x1b\\' + for seq in (seq1, seq2): + with pytest.raises(ValueError) as exc_info: + wcwidth.width(seq, control_codes='strict') + assert exc_info.value.args[0].startswith(expected_exc) -PARSE_SEQUENCE_NONE_CASES = [ - '\x1b[31m', - '\x1b]0;title\x07', 
- '\x1b]65;s=2;text\x07', - 'plain text', - '', - '\x1b]66;missing_second_semi\x07', -] - - -@pytest.mark.parametrize('seq', PARSE_SEQUENCE_NONE_CASES) -def test_parse_text_sizing_none(seq): - assert parse_text_sizing(seq) is None - - -TEXT_SIZING_WIDTH_CASES = [ +@pytest.mark.parametrize('params,text,expected_width', [ + # cases of static width=N values, (TextSizingParams(scale=2, width=3), 'anything', 6), (TextSizingParams(scale=1, width=5), '', 5), (TextSizingParams(scale=3, width=1), 'x', 3), - (TextSizingParams(scale=1, width=0), 'AB', 2), - (TextSizingParams(scale=2, width=0), 'AB', 4), - (TextSizingParams(scale=1, width=0), '\u4e2d', 2), - (TextSizingParams(scale=2, width=0), '\u4e2d', 4), - (TextSizingParams(scale=1, width=0), '', 0), - (TextSizingParams(scale=3, width=0), '', 0), -] - - -@pytest.mark.parametrize('params,inner,expected', TEXT_SIZING_WIDTH_CASES) -def test_text_sizing_width(params, inner, expected): - assert text_sizing_width(params, inner) == expected + # and automatic width (width=0) values, + (TextSizingParams(scale=1), 'AB', 2), + (TextSizingParams(scale=2), 'AB', 4), + (TextSizingParams(scale=1), '中', 2), + (TextSizingParams(scale=2), '中', 4), + (TextSizingParams(scale=1), '', 0), + (TextSizingParams(scale=3), '', 0), +]) +def test_text_sizing_width(params, text, expected_width): + """Verify width using with both kinds of terminator.""" + assert TextSizing(params, text, terminator='\x07').display_width() == expected_width + assert TextSizing(params, text, terminator='\x1b\\').display_width() == expected_width + seq1 = TextSizing(params, text, terminator='\x07').make_sequence() + seq2 = TextSizing(params, text, terminator='\x1b\\').make_sequence() + assert wcwidth.width(seq1) == expected_width + assert wcwidth.width(seq2) == expected_width + + +# ('abc\x1b]66;w=3;x\x07def', 'x', 'w=3', 7), +# ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), +@pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ + 
('\x1b]66;s=2:w=2;AB\x07', 'AB', 's=2:w=2', 4), + ('\x1b]66;s=2:w=2;\u4e2d\x07', '\u4e2d', 's=2:w=2', 4), + ('\x1b]66;s=3:w=1;x\x07', 'x', 's=3:w=1', 3), + ('\x1b]66;w=5;hello\x07', 'hello', 'w=5', 5), + ('\x1b]66;s=2:w=3;anything\x07', 'anything', 's=2:w=3', 6), + ('\x1b]66;w=3;x\x07', 'x', 'w=3', 3), + ('\x1b]66;s=1;AB\x07', 'AB', '', 2), + ('\x1b]66;s=2;AB\x07', 'AB', 's=2', 4), + ('\x1b]66;s=2;中\x07', '中', 's=2', 4), + ('\x1b]66;s=2;\x07', '', 's=2', 0), + ('\x1b]66;s=1:w=1;\x07', '', 'w=1', 1), + ('\x1b]66;w=2;A\x07', 'A', 'w=2', 2), + ('\x1b]66;s=2:w=3;text\x1b\\', 'text', 's=2:w=3', 6), +]) +def test_text_sizing_scale_width(given_sequence, expected_text, expected_params, expected_width): + ts_match = TEXT_SIZING_PATTERN.match(given_sequence) + assert ts_match is not None + text_size = TextSizing.from_match(ts_match) + assert text_size.params.make_sequence() == expected_params + assert text_size.text == expected_text + assert wcwidth.width(given_sequence, control_codes='parse') == expected_width + assert wcwidth.width(given_sequence, control_codes='strict') == expected_width + assert wcwidth.width(given_sequence, control_codes='ignore') == wcwidth.wcswidth(expected_text) -MAKE_SEQUENCE_CASES = [ - ('hi', dict(scale=2, width=1), '\x07', - '\x1b]66;s=2:w=1;hi\x07'), - ('AB', dict(scale=2, width=2), '\x1b\\', - '\x1b]66;s=2:w=2;AB\x1b\\'), - ('x', {}, '\x07', - '\x1b]66;;x\x07'), - ('', dict(scale=3, width=2), '\x07', - '\x1b]66;s=3:w=2;\x07'), -] - - -@pytest.mark.parametrize('text,kwargs,term,expected', MAKE_SEQUENCE_CASES) -def test_make_sequence(text, kwargs, term, expected): - assert _make_seq(text, terminator=term, **kwargs) == expected - - -WRAP_CASES = [ - (dict(text='AB', scale=2, width=2), - '\x1b]66;s=2:w=2;AB\x07'), - (dict(text='AB', scale=2, width=2, terminator='\x1b\\'), - '\x1b]66;s=2:w=2;AB\x1b\\'), - (dict(text='x', scale=1), - '\x1b]66;;x\x07'), - (dict(text='hi', scale=3, width=1, numerator=1, denominator=2, - vertical_align=1, 
horizontal_align=2), - '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;hi\x07'), -] - - -@pytest.mark.parametrize('kwargs,expected', WRAP_CASES) -def test_wrap(kwargs, expected): - text = kwargs.pop('text') - terminator = kwargs.pop('terminator', '\x07') - assert _make_seq(text, terminator=terminator, **kwargs) == expected - - -SCALE_CASES = [ - ('AB', 2, '\x1b]66;s=2:w=2;AB\x07'), - ('\u4e2d', 2, '\x1b]66;s=2:w=2;\u4e2d\x07'), - ('x', 3, '\x1b]66;s=3:w=1;x\x07'), - ('hello', 1, '\x1b]66;w=5;hello\x07'), +WIDTH_PARSE_IGNORED_CASES = [ + # when control_codes='ignore', only the 'inner text' width is naturally + # measured, its ] -@pytest.mark.parametrize('text,scale,expected', SCALE_CASES) -def test_scale(text, scale, expected): - inner_w = wcwidth.wcswidth(text) - assert _make_seq(text, scale=scale, width=max(0, inner_w)) == expected - - -def test_scale_st_terminator(): - text, scale = 'AB', 2 - inner_w = wcwidth.wcswidth(text) - result = _make_seq(text, scale=scale, width=max(0, inner_w), terminator='\x1b\\') - assert result == '\x1b]66;s=2:w=2;AB\x1b\\' - +@pytest.mark.parametrize('text,expected', WIDTH_PARSE_IGNORED_CASES) +def test_width_text_sizing_ignored(text, expected): + assert wcwidth.width(text, control_codes='ignore') == expected -# --- Integration tests: width() --- WIDTH_PARSE_CASES = [ ('\x1b]66;s=2:w=3;anything\x07', 6), @@ -245,32 +130,11 @@ def test_width_text_sizing_parse(text, expected): assert wcwidth.width(text) == expected -WIDTH_PARSE_IGNORED_CASES = [ - # when ignored, only the 'inner text' width is measured - ('\x1b]66;s=2:w=3;anything\x07', 8), - ('\x1b]66;w=3;x\x07', 1), - ('\x1b]66;s=1:w=0;AB\x07', 2), - ('\x1b]66;s=2:w=0;AB\x07', 2), - ('\x1b]66;s=2:w=0;\u4e2d\x07', 2), # '中' - ('\x1b]66;s=1:w=0;\x07', 0), - ('abc\x1b]66;w=3;x\x07def', 7), - ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 2), - ('\x1b]66;s=2:w=3;text\x1b\\', 4), - ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), -] - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_IGNORED_CASES) -def 
test_width_text_sizing_ignored(text, expected): - assert wcwidth.width(text, control_codes='ignore') == expected - - @pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) def test_width_text_sizing_strict(text, expected): assert wcwidth.width(text, control_codes='strict') == expected -# --- Integration tests: strip_sequences() --- - STRIP_TEXT_SIZING_CASES = [ ('\x1b]66;s=2;hello\x07', 'hello'), ('\x1b]66;s=2;hello\x1b\\', 'hello'), @@ -287,8 +151,6 @@ def test_strip_sequences_text_sizing(text, expected): assert wcwidth.strip_sequences(text) == expected -# --- Integration tests: iter_sequences() --- - def test_iter_sequences_text_sizing(): text = 'abc\x1b]66;s=2;hello\x07def' segments = list(wcwidth.iter_sequences(text)) @@ -305,7 +167,118 @@ def test_iter_sequences_text_sizing_st(): assert segments == [('\x1b]66;w=2;AB\x1b\\', True)] -# --- Integration tests: clip() --- +# ___REPLACE_PADDING_CASES = [ +# ('\x1b]66;w=3;x\x07', ' '), +# ('\x1b]66;s=2:w=2;AB\x07', ' '), +# ('abc\x1b]66;w=1;x\x07def', 'abc def'), +# ('no text sizing here', 'no text sizing here'), +# ] +# +# +# +# +# +# CONTROL_CODES_WIDTH_CASES = [ +# ('hi', dict(scale=2, width=1), '\x07', +# '\x1b]66;s=2:w=1;hi\x07'), +# ('AB', dict(scale=2, width=2), '\x1b\\', +# '\x1b]66;s=2:w=2;AB\x1b\\'), +# ('x', {}, '\x07', +# '\x1b]66;;x\x07'), +# ('', dict(scale=3, width=2), '\x07', +# '\x1b]66;s=3:w=2;\x07'), +# ] +# MAKE_SEQUENCE_CASES = [ + +# +# WRAP_CASES = [ +# (TextSizingParams(scale=2, width=2), +# '\x1b]66;s=2:w=2;ABC\x1b\\'), +# (TextSizingParams(scale=2, width=2), +# '\x1b]66;s=2:w=2;ABC\x1b\\'), +# (TextSizingParams(scale=1), +# '\x1b]66;;ABC\x1b\\'), +# (TextSizingParams(scale=3, width=1, numerator=1, denominator=2, +# vertical_align=1, horizontal_align=2), +# '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;ABC\x1b\\'), +# ] +# +# @pytest.mark.parametrize('params,expected', WRAP_CASES) +# def test_wrap(params, expected): +# text = 'ABC' +# terminator = '\x1b\\' +# assert TextSizing(params, text, 
terminator).make_sequence() == expected +# +# def test_scale_st_terminator(): +# text, scale = 'AB', 2 +# inner_w = wcwidth.wcswidth(text) +# result = _build_seq(text, +# TextSizingParams(scale=scale, width=max(0, inner_w)), +# terminator='\x1b\\') +# assert result == '\x1b]66;s=2:w=2;AB\x1b\\' +# +# +# @pytest.mark.parametrize('text,kwargs,term,expected', MAKE_SEQUENCE_CASES) +# def test_make_sequence(text, kwargs, term, expected): +# assert TextSizing(text, terminator=term, **kwargs) == expected +# +# +# @pytest.mark.parametrize('raw,expected', PARSE_PARAMS_EDGE_CASES) +# def test_parse_text_sizing_params_edge(raw, expected): +# assert _parse_text_sizing_params(raw) == expected +# +# +# PARAMS_ROUNDTRIP_CASES = [ +# TextSizingParams(), +# TextSizingParams(scale=3), +# TextSizingParams(scale=2, width=5), +# TextSizingParams(scale=7, width=7, numerator=15, denominator=15, +# vertical_align=2, horizontal_align=2), +# TextSizingParams(numerator=1, denominator=2), +# ] +# +# @pytest.mark.parametrize('params', PARAMS_ROUNDTRIP_CASES) +# def test_params_roundtrip(params): +# text_size = TextSizing(params, "abc", terminator="\x07") +# #assert _parse_text_sizing_params(_make_params_str(params)) == params + +# PARSE_PARAMS_CASES = [ +# ('', TextSizingParams()), +# ('s=2', TextSizingParams(scale=2)), +# ('w=3', TextSizingParams(width=3)), +# ('s=2:w=3', TextSizingParams(scale=2, width=3)), +# ('s=2:w=3:n=1:d=2:v=1:h=2', +# TextSizingParams(scale=2, width=3, numerator=1, denominator=2, +# vertical_align=1, horizontal_align=2)), +# ('n=5:d=10', TextSizingParams(numerator=5, denominator=10)), +# ('v=0:h=0', TextSizingParams()), +# ('s=1:w=0', TextSizingParams()), +# ] + +# PARSE_SEQUENCE_CASES = [ +# ('\x1b]66;s=2;hello\x07', +# (TextSizingParams(scale=2), 'hello', '\x07')), +# ('\x1b]66;s=99;hello\x07', +# (TextSizingParams(scale=TextSizingParams.FIELD_MAPPING['s'].high), 'hello', '\x07')), +# ('\x1b]66;s=-99;hello\x07', +# 
(TextSizingParams(scale=TextSizingParams.FIELD_MAPPING['s'].low), 'hello', '\x07')), +# ('\x1b]66;s=2;hello\x1b\\', +# (TextSizingParams(scale=2), 'hello', '\x1b\\')), +# ('\x1b]66;;text\x07', +# (TextSizingParams(), 'text', '\x07')), +# ('\x1b]66;s=3:w=2;\x07', +# (TextSizingParams(scale=3, width=2), '', '\x07')), +# ('\x1b]66;w=5;AB\x07', +# (TextSizingParams(width=5), 'AB', '\x07')), +# ('\x1b]66;s=7;' + ('X' * 30) + '\x07', +# (TextSizingParams(scale=7), 'X' * 30, '\x07')), +# ] + +# +# @pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) +# def test_parse_text_sizing(seq, expected): +# assert parse_text_sizing(seq) == expected + CLIP_TEXT_SIZING_CASES = [ ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), @@ -320,18 +293,3 @@ def test_iter_sequences_text_sizing_st(): @pytest.mark.parametrize('text,start,end,expected', CLIP_TEXT_SIZING_CASES) def test_clip_text_sizing(text, start, end, expected): assert wcwidth.clip(text, start, end) == expected - - -# --- Internal helper --- - -REPLACE_PADDING_CASES = [ - ('\x1b]66;w=3;x\x07', ' '), - ('\x1b]66;s=2:w=2;AB\x07', ' '), - ('abc\x1b]66;w=1;x\x07def', 'abc def'), - ('no text sizing here', 'no text sizing here'), -] - - -@pytest.mark.parametrize('text,expected', REPLACE_PADDING_CASES) -def test_replace_text_sizing_with_padding(text, expected): - assert _replace_text_sizing_with_padding(text) == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 7cd68e6..7da330c 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -30,15 +30,14 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr -from .text_sizing import TextSizingParams, parse_text_sizing +from .text_sizing import TextSizing, TextSizingParams # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr', - 'TextSizingParams', 'parse_text_sizing') + 'list_versions', 'propagate_sgr', 'TextSizing', 'TextSizingParams') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 1c9d310..555f0a5 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -20,6 +20,8 @@ - When ``w > 0``, return ``s * w``. - Otherwise ``w == 0``, ``s * wcswidth(inner_text_width)`` cells. +Numerator, denominator, and alignment codes and values are parsed but otherwise ignored +and have no effect on measurements made in this library. .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ .. versionadded:: 0.6.0 @@ -27,31 +29,19 @@ from __future__ import annotations import typing -from typing import NamedTuple - -# local -from .escape_sequences import TEXT_SIZING_PATTERN if typing.TYPE_CHECKING: # pragma: no cover # std imports import re -class _MetaField(NamedTuple): + +class _FieldMeta(typing.NamedTuple): name: str low: int high: int -_META_FIELDS: dict[str, MetaField] = { - 's': _MetaField('scale', low=1, high=7), - 'w': _MetaField('width', low=0, high=7), - 'n': _MetaField('numerator', low=0, high=15), - 'd': _MetaField('denominator', low=0, high=15), - 'v': _MetaField('vertical_align', low=0, high=2), - 'h': _MetaField('horizontal_align', low=0, high=2), -} - -class TextSizingParams(NamedTuple): +class TextSizingParams(typing.NamedTuple): """ Parsed parameters from a text sizing escape sequence (OSC 66). 
@@ -64,7 +54,6 @@ class TextSizingParams(NamedTuple): :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). """ - scale: int = 1 width: int = 0 numerator: int = 0 @@ -72,115 +61,119 @@ class TextSizingParams(NamedTuple): vertical_align: int = 0 horizontal_align: int = 0 - -def parse_text_sizing_params(raw: str, control_codes='parse') -> TextSizingParams: - """ - Parse colon-separated ``key=value`` metadata string. - - :param raw: Metadata string, e.g. ``'s=2:w=3'``. - :param control_does: 'parse' or 'strict'. - :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise - ValueError. - :returns: Parsed parameters with values clamped to valid ranges. - Unknown keys are ignored. Non-integer values use defaults. - - Example:: - - >>> parse_text_sizing_params('s=2:w=3') - TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - >>> parse_text_sizing_params('') - TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - """ - kwargs: dict[str, int] = {} - if not raw: - return TextSizingParams() - for part in raw.split(':'): - if '=' not in part: - continue - key, _eq, val = part.partition('=') - field = _META_FIELDS.get(key) - if field is None: - if control_codes == 'strict': - raise ValueError(f"Unknown text sizing field '{key}' in OSC 66 sequence, {raw!r}") - # ignore unknown fields unless 'strict' - continue - try: - value = int(val) - except ValueError as exc: - if control_does == 'strict': - raise ValueError(f"Illegal text sizing value '{val}' " - f"in OSC 66 sequence, {raw!r}: {exc}") - # ignore value, using default, unless 'strict' - continue - kwargs[field.name] = max(field.low, min(field.high, value)) - return TextSizingParams(**kwargs) - - -def parse_text_sizing(seq: str) -> tuple[TextSizingParams, str, str] | None: - r""" - Parse a complete text 
sizing escape sequence (OSC 66). - - :param seq: Full escape sequence string. - :returns: Tuple of ``(params, inner_text, terminator)`` or ``None`` - if the string is not a valid text sizing sequence. - - Example:: - - >>> parse_text_sizing('\x1b]66;s=2;hello\x07') - (TextSizingParams(scale=2, ...), 'hello', '\x07') - >>> parse_text_sizing('\x1b[31m') - None - """ - match = TEXT_SIZING_PATTERN.fullmatch(seq) - if not match: - return None - return ( - parse_text_sizing_params(match.group(1)), - match.group(2), - match.group(3), - ) - - -def text_sizing_width( - params: TextSizingParams, - inner_text: str, - ambiguous_width: int = 1, -) -> int: - """ - Calculate the display width of a text sizing sequence. - - :param params: Parsed parameters. - :param inner_text: The text payload of the sequence. - :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Display width in terminal cells. - - When ``params.width > 0``, returns ``params.scale * params.width``. - When ``params.width == 0``, returns ``params.scale * measured_inner_width``. - """ - if params.width > 0: - return params.scale * params.width - # Lazy import to avoid circular dependency - # pylint: disable=import-outside-toplevel - # local - from .wcwidth import wcswidth - inner_w = wcswidth(inner_text, ambiguous_width=ambiguous_width) - return params.scale * max(0, inner_w) - - -def _replace_text_sizing_with_padding( - text: str, - ambiguous_width: int = 1, -) -> str: - """ - Replace each text sizing sequence with spaces matching its declared width. - - Used internally by ``_width_ignored_codes`` to account for text sizing - width before stripping other sequences. 
- """ - def _replacer(match: 're.Match[str]') -> str: - params = parse_text_sizing_params(match.group(1)) - inner_text = match.group(2) - w = text_sizing_width(params, inner_text, ambiguous_width) - return ' ' * w - - return TEXT_SIZING_PATTERN.sub(_replacer, text) + FIELD_MAPPING = {'s': _FieldMeta(name='scale', low=1, high=7), + 'w': _FieldMeta(name='width', low=0, high=7), + 'n': _FieldMeta(name='numerator', low=0, high=15), + 'd': _FieldMeta(name='denominator', low=0, high=15), + 'v': _FieldMeta(name='vertical_align', low=0, high=2), + 'h': _FieldMeta(name='horizontal_align', low=0, high=2)} + + def make_sequence(self) -> str: + """Build and return sub-part of an OSC 66 sequence.""" + parts = [] + default_params = TextSizingParams() + # build string for all known parameters of non-default values + for field_key, field in self.FIELD_MAPPING.items(): + val = getattr(self, field.name) + default_val = getattr(default_params, field.name) + if val != default_val: + parts.append(f'{field_key}={val}') + return ':'.join(parts) + + @classmethod + def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams: + """ + Parse colon-separated ``key=value`` metadata string. + + :param raw: Metadata string, e.g. ``'s=2:w=3'``. + :param control_codes: 'parse' or 'strict'. + :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise + ValueError. + :returns: Parsed parameters with values clamped to valid ranges. + Unknown keys are ignored. Non-integer values use defaults. 
+ + Example:: + + >>> _parse_text_sizing_params('s=2:w=3') + TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + >>> _parse_text_sizing_params('') + TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + """ + kwargs: dict[str, int] = {} + for part in raw.split(':'): + if '=' not in part: + if control_codes == 'strict': + raise ValueError(f"Expected '=' in text sizing parameter (key=val), got {part!r}") + continue + key, _eq, val = part.partition('=') + field = TextSizingParams.FIELD_MAPPING.get(key) + if field is None: + if control_codes == 'strict': + raise ValueError(f"Unknown text sizing field '{key}' in OSC 66 sequence, {raw!r}") + # ignore unknown fields unless 'strict' + continue + try: + value = int(val) + except ValueError as exc: + if control_codes == 'strict': + raise ValueError(f"Illegal text sizing value '{val}' " + f"in OSC 66 sequence, {raw!r}: {exc}") from exc + # ignore value, uses default value without warning unless 'strict' + continue + if control_codes == 'strict' and (value > field.high or value < field.low): + raise ValueError(f"Out of bounds text sizing value '{val}' " + f"in OSC 66 sequence, {raw!r}: " + f"allowed range for '{key}' ({field.name}) is {field.low} to {field.high}") + kwargs[field.name] = max(field.low, min(field.high, value)) + return cls(**kwargs) + + +class TextSizing(typing.NamedTuple): + params: TextSizingParams + text: str + terminator: str + + @classmethod + def from_match(cls, match: re.Match, control_codes='parse') -> TextSizing: + """ + Parse using matching OSC 66 Sequence. + + :param match: match object from :attr:`wcwidth.escape_sequences.TEXT_SIZING_PATTERN`. + :param control_codes: 'parse' or 'strict', same meaning and delegated by + :func:`wcwidth.width`. + :raises ValueError: When ``control_codes='strict'`` for unrecognized, invalid, or out of + bounds text sizing parameters. 
+ :returns: TextSizing object from parsed sequence + + Example:: + + >>> _parse_text_sizing_params('s=2:w=3') + TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + >>> _parse_text_sizing_params('') + TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + """ + return cls(params=TextSizingParams.from_params(match.group(1), control_codes=control_codes), + text=match.group(2), + terminator=match.group(3)) + + def display_width(self, ambiguous_width: int = 1) -> int: + """ + Calculate the display width of a text sizing sequence. + + :param ambiguous_width: Width for East Asian Ambiguous characters. + :returns: Display width in terminal cells. + + When ``width > 0``, returns ``params.scale * params.width``. + When ``width == 0``, returns ``params.scale * measured_inner_width``. + """ + if self.params.width > 0: + return self.params.scale * self.params.width + # pylint: disable=import-outside-toplevel + # local + import wcwidth # Lazy import to avoid circular dependency + w = wcwidth.wcswidth(self.text, ambiguous_width=ambiguous_width) + return self.params.scale * max(0, w) + + def make_sequence(self) -> str: + """Build and return complete OSC 66 Terminal Sequence.""" + return f'\x1b]66;{self.params.make_sequence()};{self.text}{self.terminator}' diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 4582cd5..5f53715 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -14,7 +14,7 @@ from typing import TYPE_CHECKING, NamedTuple # local -from .wcwidth import width as _width +from .wcwidth import width as wcwidth_width from .wcwidth import iter_sequences from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr @@ -99,8 +99,8 @@ def _next_hyperlink_id() -> str: def _width(self, text: str) -> int: """Measure text width accounting for sequences.""" - return _width(text, control_codes=self.control_codes, tabsize=self.tabsize, - 
ambiguous_width=self.ambiguous_width) + return wcwidth_width(text, control_codes=self.control_codes, tabsize=self.tabsize, + ambiguous_width=self.ambiguous_width) def _strip_sequences(self, text: str) -> str: """Strip all terminal sequences from text.""" diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index d6b915a..911b4a4 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -80,8 +80,7 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH -from .text_sizing import text_sizing_width as _text_sizing_width -from .text_sizing import parse_text_sizing_params, _replace_text_sizing_with_padding +from .text_sizing import TextSizing from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR from .table_ambiguous import AMBIGUOUS_EASTASIAN @@ -535,8 +534,8 @@ def width( 1 """ # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals - # This could be broken into sub-functions (#1, #3, and 6 especially), but for reduced overhead - # considering this function is a likely "hot path", they are inlined, breaking many of our + # This could be broken into sub-functions (#1, #3, and #6 especially), but for reduced overhead + # in consideration of this function a likely "hot path", they are inline, breaking many pylint # complexity rules. # Fast path for ASCII printable (no tabs, escapes, or control chars) @@ -583,16 +582,12 @@ def width( # 1. Handle ESC sequences if char == '\x1b': # 1a. 
OSC 66 (kitty text sizing) positive width - if text[idx:idx + 5] == '\x1b]66;': - ts_match = TEXT_SIZING_PATTERN.match(text, idx) - if ts_match: - meta = parse_text_sizing_params(ts_match.group(1)) - current_col += _text_sizing_width( - meta, ts_match.group(2), ambiguous_width - ) - idx = ts_match.end() - max_extent = max(max_extent, current_col) - continue + if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): + text_size = TextSizing.from_match(ts_match, control_codes=control_codes) + current_col += text_size.display_width(ambiguous_width) + max_extent = max(max_extent, current_col) + idx = ts_match.end() + continue # 1b. Check all other "zero-width" terminal sequences match = ZERO_WIDTH_PATTERN.match(text, idx) if match: @@ -985,32 +980,29 @@ def clip( # Handle escape sequences if char == '\x1b': - # OSC 66 (text sizing) has positive width — handle before zero-width path - if text[idx:idx + 5] == '\x1b]66;': - ts_match = TEXT_SIZING_PATTERN.match(text, idx) - if ts_match: - meta = parse_text_sizing_params(ts_match.group(1)) - w = _text_sizing_width( - meta, ts_match.group(2), ambiguous_width - ) - if w == 0: - if start <= col < end: - output.append(ts_match.group()) - elif col >= start and col + w <= end: - output.append(ts_match.group()) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += w - elif col < end and col + w > start: - visible = min(end, col + w) - max(start, col) - output.append(fillchar * visible) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += w - else: - col += w - idx = ts_match.end() - continue + # OSC 66 (text sizing) has positive width, handle before zero-width path + if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): + text_size = TextSizing.from_match(ts_match, control_codes='parse') + w = text_size.display_width(ambiguous_width) + if col >= start and col + w <= end: + # fits as-is, keep going + output.append(ts_match.group()) + if propagate_sgr and 
sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += w + elif col < end and col + w > start: + # TODO: currently we just replace it entirely with '***', + # when, we should instead "chop up" the text to fit .. + # this function is sparingly used, but it should handle OSC 66 correctly + visible = min(end, col + w) - max(start, col) + output.append(fillchar * visible) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += w + else: + col += w + idx = ts_match.end() + continue if (match := ZERO_WIDTH_PATTERN.match(text, idx)): seq = match.group() From a2cde0960cff20ff763ead0eb873bd773d847055 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Mon, 27 Apr 2026 18:35:07 -0400 Subject: [PATCH 09/70] finish gross refactor, preventing circular imports moving lots of things from wcwidth.wcwidth to sub-modules, _constants, _wcswidth, _wcwidth, _width, and escape_sequences, notably so they can be shared, such as _wcswidth in new wcwidth.text_sizing, and existing iter_sequences by wcwidth.textwrap without any "late-binding import" to avoid circular import.
--- docs/api.rst | 4 + docs/intro.rst | 3 + docs/unicode_version.rst | 15 + pyproject.toml | 2 +- requirements-tests38.in | 2 +- requirements-tests38.txt | 10 +- requirements-tests39.in | 2 +- requirements-tests39.txt | 19 +- tests/test_benchmarks.py | 21 +- tests/test_core.py | 16 +- tox.ini | 1 - wcwidth/__init__.py | 2 +- wcwidth/_constants.py | 52 +++ wcwidth/_wcswidth.py | 152 +++++++++ wcwidth/_wcwidth.py | 63 ++++ wcwidth/_width.py | 300 +++++++++++++++++ wcwidth/escape_sequences.py | 85 +++++ wcwidth/text_sizing.py | 74 +++-- wcwidth/textwrap.py | 11 +- wcwidth/wcwidth.py | 644 ++---------------------------------- 20 files changed, 788 insertions(+), 690 deletions(-) create mode 100644 wcwidth/_constants.py create mode 100644 wcwidth/_wcswidth.py create mode 100644 wcwidth/_wcwidth.py create mode 100644 wcwidth/_width.py diff --git a/docs/api.rst b/docs/api.rst index 55d288b..a80eb40 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,4 +36,8 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions +.. autofunction:: wcwidth.TextSizing + +.. autofunction:: wcwidth.TextSizingParams + .. _SEMVER: https://semver.org diff --git a/docs/intro.rst b/docs/intro.rst index 0f1e762..e80b021 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -468,6 +468,9 @@ languages. History ======= +0.6.1 *2026-04-26* + * **New** `width()` now supports `Kitty Text Sizing Protocol`_ (OSC 66). 
+ 0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, ``drop_whitespace``, ``max_lines``, and ``placeholder`` for `wrap()`_, completing stdlib diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst index 41a1e52..38ff78d 100644 --- a/docs/unicode_version.rst +++ b/docs/unicode_version.rst @@ -16,6 +16,21 @@ release files: ``emoji-variation-sequences-12.0.0.txt`` *Date: 2019-01-15, 12:10:05 GMT* +``emoji-variation-sequences-13.0.0.txt`` + *Date: 2020-01-21, 07:15:05 GMT* + +``emoji-variation-sequences-14.0.0.txt`` + *Date: 2021-06-08, 05:19:16 GMT* + +``emoji-variation-sequences-15.0.0.txt`` + *Date: 2022-05-13, 21:54:24 GMT* + +``emoji-variation-sequences-15.1.0.txt`` + *Date: 2023-02-01, 02:22:54 GMT* + +``emoji-variation-sequences-16.0.0.txt`` + *Date: 2024-05-01, 21:25:24 GMT* + ``emoji-variation-sequences-17.0.0.txt`` *Date: 2025-01-30, 21:48:29 GMT* diff --git a/pyproject.toml b/pyproject.toml index 0fed636..a713f87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "hatchling" ] [project] name = "wcwidth" -version = "0.6.0" +version = "0.6.1" # don't forget to also update wcwidth/__init__.py:__version__ description = "Measures the displayed width of unicode strings in a terminal" readme = "README.rst" keywords = [ diff --git a/requirements-tests38.in b/requirements-tests38.in index 19efdeb..fca2238 100644 --- a/requirements-tests38.in +++ b/requirements-tests38.in @@ -1,6 +1,6 @@ # for python3.8 pytest<7 pytest-cov -pytest-xdist coverage[toml]<6 packaging<26 +pytest-benchmark<5 diff --git a/requirements-tests38.txt b/requirements-tests38.txt index 0b9d25e..d043f15 100644 --- a/requirements-tests38.txt +++ b/requirements-tests38.txt @@ -10,8 +10,6 @@ coverage==5.5 # via # -r requirements-tests38.in # pytest-cov -execnet==2.1.2 - # via pytest-xdist iniconfig==2.1.0 # via pytest packaging==25.0 @@ -22,14 +20,16 @@ pluggy==1.5.0 # via pytest py==1.11.0 # via pytest 
+py-cpuinfo==9.0.0 + # via pytest-benchmark pytest==6.2.5 # via # -r requirements-tests38.in + # pytest-benchmark # pytest-cov - # pytest-xdist -pytest-cov==5.0.0 +pytest-benchmark==4.0.0 # via -r requirements-tests38.in -pytest-xdist==3.5.0 +pytest-cov==5.0.0 # via -r requirements-tests38.in toml==0.10.2 # via diff --git a/requirements-tests39.in b/requirements-tests39.in index 8c7d45f..a8bf293 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -1,9 +1,9 @@ # For Python 3.9 *and newer* pytest>=7.4.2 pytest-cov>=4.1.0 -pytest-xdist pytest-codspeed importlib-metadata<8.7.1 packaging<26.0 tomli<2.3.0 cffi<2 +pytest-benchmark diff --git a/requirements-tests39.txt b/requirements-tests39.txt index 65682e5..18fc9df 100644 --- a/requirements-tests39.txt +++ b/requirements-tests39.txt @@ -4,7 +4,6 @@ # # pip-compile --allow-unsafe --no-emit-index-url --output-file=requirements-tests39.txt --strip-extras requirements-tests39.in # - cffi==1.17.1 # via # -r requirements-tests39.in @@ -13,8 +12,6 @@ coverage==7.10.7 # via pytest-cov exceptiongroup==1.3.1 # via pytest -execnet==2.1.2 - # via pytest-xdist importlib-metadata==8.7.0 # via # -r requirements-tests39.in @@ -33,25 +30,27 @@ pluggy==1.6.0 # via # pytest # pytest-cov +py-cpuinfo==9.0.0 + # via pytest-benchmark pycparser==2.23 # via cffi -pygments==2.19.2 +pygments==2.20.0 # via # pytest # rich pytest==8.4.2 # via # -r requirements-tests39.in + # pytest-benchmark # pytest-codspeed # pytest-cov - # pytest-xdist -pytest-codspeed==4.2.0 +pytest-benchmark==5.2.3 # via -r requirements-tests39.in -pytest-cov==7.0.0 +pytest-codspeed==4.4.0 # via -r requirements-tests39.in -pytest-xdist==3.8.0 +pytest-cov==7.1.0 # via -r requirements-tests39.in -rich==14.3.1 +rich==15.0.0 # via pytest-codspeed tomli==2.2.1 # via @@ -60,5 +59,5 @@ tomli==2.2.1 # pytest typing-extensions==4.15.0 # via exceptiongroup -zipp==3.23.0 +zipp==3.23.1 # via importlib-metadata diff --git a/tests/test_benchmarks.py 
b/tests/test_benchmarks.py index b85448e..e642a17 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -3,6 +3,7 @@ import os import sys import unicodedata +import platform # 3rd party import pytest @@ -10,7 +11,7 @@ # local import wcwidth -_wcwidth_module = sys.modules['wcwidth.wcwidth'] +_width_module = sys.modules['wcwidth._width'] def test_wcwidth_ascii(benchmark): @@ -373,8 +374,13 @@ def test_width_brahmic_bengali(benchmark): reason=f"{os.path.basename(UDHR_FILE)} is missing; run bin/update-tables.py", ) +_py38_skip_pedantic = pytest.mark.skipif( + sys.version_info[:2] < (3, 9), + reason=f'benchmark.pedantic() not supported in python 3.8 or earlier') + @_udhr_skip +@_py38_skip_pedantic def test_wrap_udhr(benchmark): """Benchmark wrap() with multilingual UDHR text.""" result = benchmark.pedantic(wcwidth.wrap, args=(UDHR_TEXT, 80), rounds=1, iterations=1) @@ -383,6 +389,7 @@ def test_wrap_udhr(benchmark): @_udhr_skip +@_py38_skip_pedantic def test_width_udhr(benchmark): """Benchmark width() with multilingual UDHR text.""" result = benchmark.pedantic(wcwidth.width, args=(UDHR_TEXT,), rounds=1, iterations=1) @@ -390,6 +397,7 @@ def test_width_udhr(benchmark): @_udhr_skip +@_py38_skip_pedantic def test_width_udhr_lines(benchmark): """Benchmark width() on individual UDHR lines.""" result = benchmark.pedantic(lambda: sum(wcwidth.width(line) for line in UDHR_LINES), @@ -398,6 +406,7 @@ def test_width_udhr_lines(benchmark): @_udhr_skip +@_py38_skip_pedantic def test_width_wcswidth_consistency_udhr(benchmark): """Verify width() and wcswidth() agree for printable multilingual text.""" def check(): @@ -415,23 +424,25 @@ def check(): @_udhr_skip +@_py38_skip_pedantic def test_width_fastpath_integrity_udhr(benchmark): """Verify width() produces identical results with and without the fast path.""" - saved = _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN + saved = _width_module._WIDTH_FAST_PATH_MIN_LEN def check(): - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = 0 
+ _width_module._WIDTH_FAST_PATH_MIN_LEN = 0 fast_total = sum(wcwidth.width(line) for line in UDHR_LINES) - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = 999_999 + _width_module._WIDTH_FAST_PATH_MIN_LEN = 999_999 parse_total = sum(wcwidth.width(line) for line in UDHR_LINES) return fast_total, parse_total fast_total, parse_total = benchmark.pedantic(check, rounds=1, iterations=1) - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = saved + _width_module._WIDTH_FAST_PATH_MIN_LEN = saved assert fast_total == parse_total @_udhr_skip +@_py38_skip_pedantic def test_ljust_udhr_lines(benchmark): """Benchmark ljust() on UDHR lines.""" benchmark.pedantic(lambda: [wcwidth.ljust(line, w + 1, UDHR_FILLCHAR) diff --git a/tests/test_core.py b/tests/test_core.py index 024dcdb..ba7b32e 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -8,9 +8,10 @@ # local import wcwidth +from wcwidth._width import _WIDTH_FAST_PATH_MIN_LEN _wcwidth_module = sys.modules['wcwidth.wcwidth'] -_WIDTH_FAST_PATH_MIN_LEN = _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN +# local def test_package_version(): @@ -414,13 +415,12 @@ def test_bengali_nukta_mc(): @pytest.mark.parametrize("repeat", [1, _WIDTH_FAST_PATH_MIN_LEN]) def test_mc_width_consistency(repeat): - # width(), wcswidth(), and per-grapheme width sums must all agree. - # - # The repeat parameter ensures both the short (parse) and long (fast) code - # paths of width() are exercised. At repeat=1 the phrases are short enough - # to go through character-by-character parse mode. At repeat=_WIDTH_FAST_PATH_MIN_LEN - # every phrase exceeds the threshold and takes the fast path that delegates - # to wcswidth(). + """ + Check width() vs. 
+ + wcswidth() consistency + """ + # repeat value 'WIDTH_FAST_PATH_MIN_LEN' ensures both "fast" and "slow" paths are taken phrases = [ "\u0915\u094D\u0937\u093F", "\u0b95\u0bcd\u0bb7\u0bcc", diff --git a/tox.ini b/tox.ini index 2915bed..a921a79 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,6 @@ pip_compile_command = pip-compile --resolver=backtracking --strip-extras --no-em [testenv] deps = -r requirements-tests39.txt commands = {envpython} -m pytest --cov-config={toxinidir}/tox.ini {posargs:\ - -n auto \ --verbose \ --junit-xml=.tox/results.{envname}.xml \ --durations=3 \ diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 7da330c..d60fdf1 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -41,4 +41,4 @@ # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places -__version__ = '0.6.0' +__version__ = '0.6.1' # don't forget to also update pyproject.toml:version diff --git a/wcwidth/_constants.py b/wcwidth/_constants.py new file mode 100644 index 0000000..f414a97 --- /dev/null +++ b/wcwidth/_constants.py @@ -0,0 +1,52 @@ +"""Shared data tables and constants for wcwidth.py, _wcwidth.py, and _wcswidth.py.""" +# local +from .table_mc import CATEGORY_MC +from .table_wide import WIDE_EASTASIAN +from .table_zero import ZERO_WIDTH +from .table_grapheme import EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR +from .table_ambiguous import AMBIGUOUS_EASTASIAN +from .unicode_versions import list_versions + +_REGIONAL_INDICATOR_SET = frozenset( + range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1) +) +_ISC_VIRAMA_SET = frozenset(( + 0x094D, # DEVANAGARI SIGN VIRAMA + 0x09CD, # BENGALI SIGN VIRAMA + 0x0A4D, # GURMUKHI SIGN VIRAMA + 0x0ACD, # GUJARATI SIGN VIRAMA + 0x0B4D, # ORIYA SIGN VIRAMA + 0x0BCD, # TAMIL SIGN VIRAMA + 0x0C4D, # TELUGU SIGN VIRAMA + 0x0CCD, # KANNADA SIGN VIRAMA + 0x0D4D, 
# MALAYALAM SIGN VIRAMA + 0x0DCA, # SINHALA SIGN AL-LAKUNA + 0x1B44, # BALINESE ADEG ADEG + 0xA806, # SYLOTI NAGRI SIGN HASANTA + 0xA8C4, # SAURASHTRA SIGN VIRAMA + 0xA9C0, # JAVANESE PANGKON + 0x11046, # BRAHMI VIRAMA + 0x110B9, # KAITHI SIGN VIRAMA + 0x111C0, # SHARADA SIGN VIRAMA + 0x11235, # KHOJKI SIGN VIRAMA + 0x1134D, # GRANTHA SIGN VIRAMA + 0x11442, # NEWA SIGN VIRAMA + 0x114C2, # TIRHUTA SIGN VIRAMA + 0x115BF, # SIDDHAM SIGN VIRAMA + 0x1163F, # MODI SIGN VIRAMA + 0x116B6, # TAKRI SIGN VIRAMA + 0x11839, # DOGRA SIGN VIRAMA + 0x119E0, # NANDINAGARI SIGN VIRAMA + 0x11C3F, # BHAIKSUKI SIGN VIRAMA +)) +# pylint: disable=invalid-name +_LATEST_VERSION = list_versions()[-1] +_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION] +_EMOJI_ZWJ_SET = frozenset( + cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1) +) | _REGIONAL_INDICATOR_SET +_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF) + +_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION] +_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION] +_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))] diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py new file mode 100644 index 0000000..82a056a --- /dev/null +++ b/wcwidth/_wcswidth.py @@ -0,0 +1,152 @@ +"""This is a python implementation of wcswidth().""" +# std imports +import typing + +# local +from ._wcwidth import wcwidth +from .bisearch import bisearch +from ._constants import (_EMOJI_ZWJ_SET, + _ISC_VIRAMA_SET, + _CATEGORY_MC_TABLE, + _FITZPATRICK_RANGE, + _REGIONAL_INDICATOR_SET) +from .table_vs16 import VS16_NARROW_TO_WIDE +from .table_grapheme import ISC_CONSONANT + + +def wcswidth( + pwcs: str, + n: typing.Union[int, None] = None, + unicode_version: str = 'auto', + ambiguous_width: int = 1, +) -> int: + """ + Given a unicode string, return its printable length on a terminal. + + See :ref:`Specification` for details of cell measurement. 
+ + This implementation differs from Markus Kuhn's original POSIX C implementation, in that this + ``wcswidth()`` processes grapheme strings yielded by :func:`wcwidth.iter_graphemes` defined by + `Unicode Standard Annex #29`_. POSIX wcswidth(3) is not grapheme-aware and does not measure many + kinds of Emojis or complex scripts correctly. + + :param pwcs: Measure width of given unicode string. + :param n: When ``n`` is None (default), return the length of the entire + string, otherwise only the first ``n`` characters are measured. + + :param unicode_version: Ignored. Retained for backwards compatibility. + + .. deprecated:: 0.3.0 + Only the latest Unicode version is now shipped. + + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: The width, in cells, needed to display the first ``n`` characters + of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control + characters! + + .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/ + """ + # pylint: disable=unused-argument,too-many-locals,too-many-statements + # pylint: disable=too-complex,too-many-branches,duplicate-code + # This function is intentionally kept long without delegating functions to reduce function calls in + # "hot path", the overhead per-character adds up.
+ + # Fast path: pure ASCII printable strings are always width == length + if n is None and pwcs.isascii() and pwcs.isprintable(): + return len(pwcs) + + # Select wcwidth call pattern for best lru_cache performance: + # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls + # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) + _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) + + end = len(pwcs) if n is None else n + total_width = 0 + idx = 0 + last_measured_idx = -2 # Track index of last measured char for VS16 + last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) + last_was_virama = False # Virama conjunct formation state + conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) + while idx < end: + char = pwcs[idx] + ucs = ord(char) + if ucs == 0x200D: + if last_was_virama: + # ZWJ after virama requests explicit half-form rendering but + # does not change cell count — consume ZWJ only, let the next + # consonant be handled by the virama conjunct rule. + idx += 1 + elif idx + 1 < end: + # Emoji ZWJ: skip next character unconditionally. + idx += 2 + last_was_virama = False + else: + idx += 1 + last_was_virama = False + continue + if ucs == 0xFE0F and last_measured_idx >= 0: + # VS16 following a measured character: add 1 if that character is + # known to be converted from narrow to wide by VS16. 
+ total_width += bisearch(ord(pwcs[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]) + last_measured_idx = -2 # Prevent double application + # VS16 preserves emoji context: last_measured_ucs stays as the base + idx += 1 + continue + # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + # Lazy RI pairing: count preceding consecutive RIs only when the last one is + # received, because RI's are received so rarely its better than per-loop tracking of + # 'last char was an RI'. + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd + # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag + # and wide 'U'. + idx += 1 + last_measured_ucs = ucs + continue + # First or unpaired RI: measured normally (width 2 from table) + # Fitzpatrick modifier: zero-width when following emoji base + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and last_measured_ucs in _EMOJI_ZWJ_SET): + idx += 1 + continue + # Virama conjunct formation: consonant following virama contributes 0 width. 
+ # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category + if last_was_virama and bisearch(ucs, ISC_CONSONANT): + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + conjunct_pending = True + idx += 1 + continue + wcw = _wcwidth(char) + if wcw < 0: + # early return -1 on C0 and C1 control characters + return wcw + if wcw > 0: + if conjunct_pending: + total_width += 1 + conjunct_pending = False + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + wcw = 1 + last_measured_idx = -2 + last_was_virama = False + conjunct_pending = False + else: + last_was_virama = ucs in _ISC_VIRAMA_SET + total_width += wcw + idx += 1 + if conjunct_pending: + total_width += 1 + return total_width diff --git a/wcwidth/_wcwidth.py b/wcwidth/_wcwidth.py new file mode 100644 index 0000000..0403b32 --- /dev/null +++ b/wcwidth/_wcwidth.py @@ -0,0 +1,63 @@ +"""This is a python implementation of wcwidth().""" +# std +# std imports +from functools import lru_cache + +# local +from .bisearch import bisearch +from ._constants import _AMBIGUOUS_TABLE, _ZERO_WIDTH_TABLE, _WIDE_EASTASIAN_TABLE + +# maxsize=1024: western scripts need ~64 unique codepoints per session, but +# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates +# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. + + +@lru_cache(maxsize=1024) +def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument + r""" + Given one Unicode codepoint, return its printable length on a terminal. + + :param wc: A single Unicode character. + :param unicode_version: Ignored. Retained for backwards compatibility. + + .. deprecated:: 0.3.0 + Only the latest Unicode version is now shipped. 
+ + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts + where ambiguous characters display as double-width. See + :ref:`ambiguous_width` for details. + :returns: The width, in cells, necessary to display the character of + Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has + no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is + not printable, or has an indeterminate effect on the terminal, such as + a control character. Otherwise, the number of column positions the + character occupies on a graphic terminal (1 or 2) is returned. + + See :ref:`Specification` for details of cell measurement. + """ + ucs = ord(wc) if wc else 0 + + # small optimization: early return of 1 for printable ASCII, this provides + # approximately 40% performance improvement for mostly-ascii documents, with + # less than 1% impact to others. + if 32 <= ucs < 0x7f: + return 1 + + # C0/C1 control characters are -1 for compatibility with POSIX-like calls + if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: + return -1 + + # Zero width + if bisearch(ucs, _ZERO_WIDTH_TABLE): + return 0 + + # Wide (F/W categories) + if bisearch(ucs, _WIDE_EASTASIAN_TABLE): + return 2 + + # Ambiguous width (A category) - only when ambiguous_width=2 + if ambiguous_width == 2 and bisearch(ucs, _AMBIGUOUS_TABLE): + return 2 + + return 1 diff --git a/wcwidth/_width.py b/wcwidth/_width.py new file mode 100644 index 0000000..f04ce68 --- /dev/null +++ b/wcwidth/_width.py @@ -0,0 +1,300 @@ +"""This is a high-level width() supporting terminal output.""" + +from typing import Literal + +# local +from ._wcwidth import wcwidth +from .bisearch import bisearch +from ._wcswidth import wcswidth +from ._constants import (_EMOJI_ZWJ_SET, + _ISC_VIRAMA_SET, + _CATEGORY_MC_TABLE, + _FITZPATRICK_RANGE, + _REGIONAL_INDICATOR_SET) +from .table_vs16 import VS16_NARROW_TO_WIDE +from .text_sizing import TextSizing +from 
.control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL +from .table_grapheme import ISC_CONSONANT +from .escape_sequences import (ZERO_WIDTH_PATTERN, + TEXT_SIZING_PATTERN, + CURSOR_LEFT_SEQUENCE, + CURSOR_RIGHT_SEQUENCE, + INDETERMINATE_EFFECT_SEQUENCE, + strip_sequences) + +# In 'parse' mode, strings longer than this are checked for cursor-movement +# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to +# 'ignore' to skip character-by-character parsing. The detection scan cost is +# negligible for long strings but wasted on short ones like labels or headings. +_WIDTH_FAST_PATH_MIN_LEN = 20 + +# Translation table to strip C0/C1 control characters for fast 'ignore' mode. +_CONTROL_CHAR_TABLE = str.maketrans('', '', ( + ''.join(chr(c) for c in range(0x00, 0x20)) + # C0: NUL through US (including tab) + '\x7f' + # DEL + ''.join(chr(c) for c in range(0x80, 0xa0)) # C1: U+0080-U+009F +)) + + +def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: + """ + Fast path for width() with control_codes='ignore'. + + Strips escape sequences and control characters, then measures remaining text. + """ + return wcswidth( + strip_sequences(text).translate(_CONTROL_CHAR_TABLE), + ambiguous_width=ambiguous_width + ) + + +def width( + text: str, + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + tabsize: int = 8, + ambiguous_width: int = 1, +) -> int: + r""" + Return printable width of text containing many kinds of control codes and sequences. + + Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal + output sequences. Never returns -1. + + :param text: String to measure. + :param control_codes: How to handle control characters and sequences: + + - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB + ``\t``, cursor left and right movement sequences. 
Vertical movement (LF, VT, FF) and + indeterminate terminal sequences are zero-width. OSC 66 Kitty Text Sizing protocol, OSC 8 + Hyperlink, and many other kinds of output sequences are parsed for displayed measurements. + - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with + indeterminate results of the screen or cursor, like clear or vertical movement. Generally, + these should be handled with a virtual terminal emulator (like 'pyte'). + - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as + width 0. This is the fastest measurement for text already filtered or known not to contain + any kinds of control codes or sequences. TAB ``\t`` is zero-width; to ensure + tab expansion, pre-process text using :func:`str.expandtabs`. + + :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. + Must be positive. Has no effect when ``control_codes='ignore'``. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences + present in ``text`` according to given parameters. This represents the rightmost column the + cursor reaches. Always a non-negative integer. + + :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate + effects, such as vertical movement or clear sequences are encountered, or on unexpected + C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values. + + .. 
versionadded:: 0.3.0 + + Examples:: + + >>> width('hello') + 5 + >>> width('コンニチハ') + 10 + >>> width('\x1b[31mred\x1b[0m') + 3 + >>> width('\x1b[31mred\x1b[0m', control_codes='ignore') # same result (ignored) + 3 + >>> width('123\b4') # backspace overwrites previous cell (outputs '124') + 3 + >>> width('abc\t') # tab caused cursor to move to column 8 + 8 + >>> width('1\x1b[10C') # '1' + cursor right 10, cursor ends on column 11 + 11 + >>> width('1\x1b[10C', control_codes='ignore') # faster but wrong in this case + 1 + """ + # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals + # This could be broken into sub-functions (#1, #3, and #6 especially), but for reduced overhead + # in consideration of this function a likely "hot path", they are inline, breaking many pylint + # complexity rules. + + # Fast path for ASCII printable (no tabs, escapes, or control chars) + if text.isascii() and text.isprintable(): + return len(text) + + # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. + # Only check longer strings - the detection overhead hurts short string performance. + if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: + # Check for cursor-affecting control characters + if '\b' not in text and '\t' not in text and '\r' not in text: + # Check for escape sequences that can't be ignored, if present + if '\x1b' not in text or ( + not CURSOR_RIGHT_SEQUENCE.search(text) and + not CURSOR_LEFT_SEQUENCE.search(text) and + not TEXT_SIZING_PATTERN.search(text) + ): + control_codes = 'ignore' + + # Fast path for ignore mode, useful if you know the text is already free of control codes + if control_codes == 'ignore': + return _width_ignored_codes(text, ambiguous_width) + + strict = control_codes == 'strict' + # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0. + # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width. 
+ current_col = 0 + max_extent = 0 + idx = 0 + last_measured_idx = -2 # Track index of last measured char for VS16; -2 can never match idx-1 + last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) + last_was_virama = False # Virama conjunct formation state + conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) + text_len = len(text) + + # Select wcwidth call pattern for best lru_cache performance: + # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls + # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) + _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) + + while idx < text_len: + char = text[idx] + + # 1. Handle ESC sequences + if char == '\x1b': + # 1a. OSC 66 (kitty text sizing) positive width + if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): + text_size = TextSizing.from_match(ts_match, control_codes=control_codes) + current_col += text_size.display_width(ambiguous_width) + max_extent = max(max_extent, current_col) + idx = ts_match.end() + continue + # 1b. Check all other "zero-width" terminal sequences + match = ZERO_WIDTH_PATTERN.match(text, idx) + if match: + seq = match.group() + if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + raise ValueError(f"Indeterminate cursor sequence at position {idx}") + # Apply cursor movement + right = CURSOR_RIGHT_SEQUENCE.match(seq) + if right: + current_col += int(right.group(1) or 1) + else: + left = CURSOR_LEFT_SEQUENCE.match(seq) + if left: + current_col = max(0, current_col - int(left.group(1) or 1)) + idx = match.end() + else: + idx += 1 + max_extent = max(max_extent, current_col) + continue + + # 2. 
Handle illegal and vertical control characters (zero width, error in strict) + if char in ILLEGAL_CTRL: + if strict: + raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") + idx += 1 + continue + + if char in VERTICAL_CTRL: + if strict: + raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") + idx += 1 + continue + + # 3. Handle horizontal movement characters + if char in HORIZONTAL_CTRL: + if char == '\x09' and tabsize > 0: # Tab + current_col += tabsize - (current_col % tabsize) + elif char == '\x08': # Backspace + if current_col > 0: + current_col -= 1 + elif char == '\x0d': # Carriage return + current_col = 0 + max_extent = max(max_extent, current_col) + idx += 1 + continue + + # 4. Handle ZWJ + if char == '\u200D': + if last_was_virama: + # ZWJ after virama requests explicit half-form rendering but + # does not change cell count — consume ZWJ only, let the next + # consonant be handled by the virama conjunct rule. + idx += 1 + elif idx + 1 < text_len: + # Emoji ZWJ: skip next character unconditionally. + idx += 2 + last_was_virama = False + else: + idx += 1 + last_was_virama = False + continue + + # 5. Handle other zero-width characters (control chars) + if char in ZERO_WIDTH_CTRL: + idx += 1 + continue + + ucs = ord(char) + + # 6. Handle VS16: converts preceding narrow character to wide + if ucs == 0xFE0F: + if last_measured_idx == idx - 1: + if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]): + current_col += 1 + max_extent = max(max_extent, current_col) + # VS16 preserves emoji context: last_measured_ucs stays as the base + idx += 1 + continue + + # 6b. 
Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + # Lazy RI pairing: count preceding consecutive RIs + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + last_measured_ucs = ucs + idx += 1 + continue + # 6c. Fitzpatrick modifier: zero-width when following emoji base + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and last_measured_ucs in _EMOJI_ZWJ_SET): + idx += 1 + continue + + # 7. Virama conjunct formation: consonant following virama contributes 0 width. + # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category + if last_was_virama and bisearch(ucs, ISC_CONSONANT): + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + conjunct_pending = True + idx += 1 + continue + + # 8. Normal characters: measure with wcwidth + w = _wcwidth(char) + if w > 0: + if conjunct_pending: + current_col += 1 + conjunct_pending = False + current_col += w + max_extent = max(max_extent, current_col) + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + current_col += 1 + max_extent = max(max_extent, current_col) + last_measured_idx = -2 + last_was_virama = False + conjunct_pending = False + else: + last_was_virama = ucs in _ISC_VIRAMA_SET + idx += 1 + + if conjunct_pending: + current_col += 1 + max_extent = max(max_extent, current_col) + return max_extent diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 610f1aa..9bea12d 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -7,6 +7,7 @@ """ # std imports import re +import typing # Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. 
# Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). @@ -74,3 +75,87 @@ r'\x1bM', # scroll_reverse (reverse index) )) ) + + +def iter_sequences(text: str) -> typing.Iterator[typing.Tuple[str, bool]]: + r""" + Iterate through text, yielding segments with sequence identification. + + This generator yields tuples of ``(segment, is_sequence)`` for each part + of the input text, where ``is_sequence`` is ``True`` if the segment is + a recognized terminal escape sequence. + + :param text: String to iterate through. + :returns: Iterator of (segment, is_sequence) tuples. + + .. versionadded:: 0.3.0 + + Example:: + + >>> list(iter_sequences('hello')) + [('hello', False)] + >>> list(iter_sequences('\x1b[31mred')) + [('\x1b[31m', True), ('red', False)] + >>> list(iter_sequences('\x1b[1m\x1b[31m')) + [('\x1b[1m', True), ('\x1b[31m', True)] + """ + idx = 0 + text_len = len(text) + segment_start = 0 + + while idx < text_len: + char = text[idx] + + if char == '\x1b': + # Yield any accumulated non-sequence text + if idx > segment_start: + yield (text[segment_start:idx], False) + + # Try to match an escape sequence + match = ZERO_WIDTH_PATTERN.match(text, idx) + if match: + yield (match.group(), True) + idx = match.end() + else: + # Lone ESC or unrecognized - yield as sequence anyway + yield (char, True) + idx += 1 + segment_start = idx + else: + idx += 1 + + # Yield any remaining text + if segment_start < text_len: + yield (text[segment_start:], False) + + +def strip_sequences(text: str) -> str: + r""" + Return text with all terminal escape sequences removed. + + For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), + the inner text is preserved. + + Unknown or incomplete ESC sequences are preserved. + + :param text: String that may contain terminal escape sequences. + :returns: The input text with all escape sequences stripped. + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> strip_sequences('\x1b[31mred\x1b[0m') + 'red' + >>> strip_sequences('hello') + 'hello' + >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') + 'bold red text' + >>> strip_sequences('\x1b]66;s=2;hello\x07') + 'hello' + >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') + '[view]' + """ + if '\x1b]66;' in text: + text = TEXT_SIZING_PATTERN.sub(r'\2', text) + return ZERO_WIDTH_PATTERN.sub('', text) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 555f0a5..9427a2c 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -1,7 +1,7 @@ r""" -`Kitty Text Sizing Protocol`_ (OSC 66) parsing and measurement. +`kitty text sizing protocol`_ (OSC 66) parsing and measurement. -The `Kitty Text Sizing Protocol`_ allows terminal apps to explicitly tell +The kitty text sizing protocol allows terminal apps to explicitly tell terminals how many cells text occupies, using the escape sequence:: ESC ] 66 ; metadata ; text BEL/ST @@ -24,15 +24,17 @@ and have no effect on measurements made in this library. .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ -.. versionadded:: 0.6.0 +.. 
versionadded:: 0.6.1 """ from __future__ import annotations +# std imports +import re + import typing -if typing.TYPE_CHECKING: # pragma: no cover - # std imports - import re +# local +from ._wcswidth import wcswidth class _FieldMeta(typing.NamedTuple): @@ -41,6 +43,15 @@ class _FieldMeta(typing.NamedTuple): high: int +TEXT_FIELD_MAPPING: dict[str, _FieldMeta] = { + 's': _FieldMeta(name='scale', low=1, high=7), + 'w': _FieldMeta(name='width', low=0, high=7), + 'n': _FieldMeta(name='numerator', low=0, high=15), + 'd': _FieldMeta(name='denominator', low=0, high=15), + 'v': _FieldMeta(name='vertical_align', low=0, high=2), + 'h': _FieldMeta(name='horizontal_align', low=0, high=2)} + + class TextSizingParams(typing.NamedTuple): """ Parsed parameters from a text sizing escape sequence (OSC 66). @@ -54,6 +65,7 @@ class TextSizingParams(typing.NamedTuple): :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). 
""" + scale: int = 1 width: int = 0 numerator: int = 0 @@ -61,19 +73,12 @@ class TextSizingParams(typing.NamedTuple): vertical_align: int = 0 horizontal_align: int = 0 - FIELD_MAPPING = {'s': _FieldMeta(name='scale', low=1, high=7), - 'w': _FieldMeta(name='width', low=0, high=7), - 'n': _FieldMeta(name='numerator', low=0, high=15), - 'd': _FieldMeta(name='denominator', low=0, high=15), - 'v': _FieldMeta(name='vertical_align', low=0, high=2), - 'h': _FieldMeta(name='horizontal_align', low=0, high=2)} - def make_sequence(self) -> str: """Build and return sub-part of an OSC 66 sequence.""" parts = [] default_params = TextSizingParams() # build string for all known parameters of non-default values - for field_key, field in self.FIELD_MAPPING.items(): + for field_key, field in TEXT_FIELD_MAPPING.items(): val = getattr(self, field.name) default_val = getattr(default_params, field.name) if val != default_val: @@ -94,22 +99,23 @@ def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams Example:: - >>> _parse_text_sizing_params('s=2:w=3') - TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - >>> _parse_text_sizing_params('') - TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + >>> TextSizingParams.from_params('s=2:w=3') + TextSizingParams(scale=2, width=3, numerator=0, denominator=0, \ + vertical_align=0, horizontal_align=0) """ - kwargs: dict[str, int] = {} + kwargs: typing.Dict[str, int] = {} for part in raw.split(':'): if '=' not in part: if control_codes == 'strict': - raise ValueError(f"Expected '=' in text sizing parameter (key=val), got {part!r}") + raise ValueError(f"Expected '=' in text sizing parameter (key=val), " + f"got {part!r}") continue key, _eq, val = part.partition('=') - field = TextSizingParams.FIELD_MAPPING.get(key) + field = TEXT_FIELD_MAPPING.get(key) if field is None: if control_codes == 'strict': - raise ValueError(f"Unknown 
text sizing field '{key}' in OSC 66 sequence, {raw!r}") + raise ValueError(f"Unknown text sizing field '{key}' " + f"in OSC 66 sequence, {raw!r}") # ignore unknown fields unless 'strict' continue try: @@ -123,23 +129,26 @@ def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams if control_codes == 'strict' and (value > field.high or value < field.low): raise ValueError(f"Out of bounds text sizing value '{val}' " f"in OSC 66 sequence, {raw!r}: " - f"allowed range for '{key}' ({field.name}) is {field.low} to {field.high}") + f"allowed range for '{key}' ({field.name}) " + f"is {field.low} to {field.high}") kwargs[field.name] = max(field.low, min(field.high, value)) return cls(**kwargs) class TextSizing(typing.NamedTuple): + """Basic horizontal width measurement for kitty text sizing protocol.""" + params: TextSizingParams text: str terminator: str @classmethod - def from_match(cls, match: re.Match, control_codes='parse') -> TextSizing: - """ + def from_match(cls, match: re.Match[str], control_codes: str = 'parse') -> TextSizing: + r""" Parse using matching OSC 66 Sequence. :param match: match object from :attr:`wcwidth.escape_sequences.TEXT_SIZING_PATTERN`. - :param control_codes: 'parse' or 'strict', same meaning and delegated by + :param control_codes: 'parse' or 'strict', same meaning as delegated by :func:`wcwidth.width`. :raises ValueError: When ``control_codes='strict'`` for unrecognized, invalid, or out of bounds text sizing parameters. 
@@ -147,10 +156,10 @@ def from_match(cls, match: re.Match, control_codes='parse') -> TextSizing: Example:: - >>> _parse_text_sizing_params('s=2:w=3') - TextSizingParams(scale=2, width=3, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) - >>> _parse_text_sizing_params('') - TextSizingParams(scale=1, width=0, numerator=0, denominator=0, vertical_align=0, horizontal_align=0) + >>> from wcwidth.escape_sequences import TEXT_SIZING_PATTERN + >>> TextSizing.from_match(TEXT_SIZING_PATTERN.match('\x1b]66;w=2;XY\x07')) + TextSizing(params=TextSizingParams(scale=1, width=2, numerator=0, denominator=0, \ + vertical_align=0, horizontal_align=0), text='XY', terminator='\x07') """ return cls(params=TextSizingParams.from_params(match.group(1), control_codes=control_codes), text=match.group(2), @@ -168,10 +177,7 @@ def display_width(self, ambiguous_width: int = 1) -> int: """ if self.params.width > 0: return self.params.scale * self.params.width - # pylint: disable=import-outside-toplevel - # local - import wcwidth # Lazy import to avoid circular dependency - w = wcwidth.wcswidth(self.text, ambiguous_width=ambiguous_width) + w = wcswidth(self.text, ambiguous_width=ambiguous_width) return self.params.scale * max(0, w) def make_sequence(self) -> str: diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 5f53715..b471723 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -14,11 +14,9 @@ from typing import TYPE_CHECKING, NamedTuple # local -from .wcwidth import width as wcwidth_width -from .wcwidth import iter_sequences from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr -from .escape_sequences import ZERO_WIDTH_PATTERN +from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal @@ -99,8 +97,11 @@ def _next_hyperlink_id() -> str: def _width(self, text: str) -> int: """Measure text width accounting for sequences.""" - return 
wcwidth_width(text, control_codes=self.control_codes, tabsize=self.tabsize, - ambiguous_width=self.ambiguous_width) + # pylint: disable=import-outside-toplevel + # local + from .wcwidth import width + return width(text, control_codes=self.control_codes, tabsize=self.tabsize, + ambiguous_width=self.ambiguous_width) def _strip_sequences(self, text: str) -> str: """Strip all terminal sequences from text.""" diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 911b4a4..48383d0 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -3,9 +3,7 @@ https://github.com/jquast/wcwidth -from Markus Kuhn's C code, retrieved from: - - http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +Derived from Markus Kuhn's C code. This is an implementation of wcwidth() and wcswidth() (defined in IEEE Std 1002.1-2001) for Unicode. @@ -66,97 +64,46 @@ # std imports from functools import lru_cache -from typing import TYPE_CHECKING +from typing import Literal # local +# pylint: disable=unused-import +# Some CONSTANTS imported are now unused, like _wcversion_value(), they were first defined in this +# file location, and remain there for API compatibility purposes. _wcversion_value and +# _wcmatch_version are no longer used internally since version 0.5.0 (only the latest Unicode +# version is shipped), and many global constants, now unused here, were moved to _constants.py in +# version 0.6.1. +# +# They are retained for API compatibility with external tools like ucs-detect +# that may use these private functions. 
+# +from ._width import width +from ._wcwidth import wcwidth from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes -from .table_mc import CATEGORY_MC +from ._wcswidth import wcswidth from .sgr_state import (_SGR_PATTERN, _SGR_STATE_DEFAULT, _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) +from ._constants import _LATEST_VERSION from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH from .text_sizing import TextSizing from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL -from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR +from .table_grapheme import ISC_CONSONANT from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import (ZERO_WIDTH_PATTERN, TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, - INDETERMINATE_EFFECT_SEQUENCE) + INDETERMINATE_EFFECT_SEQUENCE, + iter_sequences, + strip_sequences) from .unicode_versions import list_versions -if TYPE_CHECKING: # pragma: no cover - # std imports - from collections.abc import Iterator - - from typing import Literal - -# Pre-compute table references for the latest (and only) Unicode version. -_LATEST_VERSION = list_versions()[-1] -_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION] -_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION] -_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))] -_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION] -_REGIONAL_INDICATOR_SET = frozenset( - range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1) -) -_EMOJI_ZWJ_SET = frozenset( - cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1) -) | _REGIONAL_INDICATOR_SET -_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF) -# Indic_Syllabic_Category=Virama codepoints, from IndicSyllabicCategory.txt. -# These are structurally tied to their scripts and not expected to change. 
-# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt -_ISC_VIRAMA_SET = frozenset(( - 0x094D, # DEVANAGARI SIGN VIRAMA - 0x09CD, # BENGALI SIGN VIRAMA - 0x0A4D, # GURMUKHI SIGN VIRAMA - 0x0ACD, # GUJARATI SIGN VIRAMA - 0x0B4D, # ORIYA SIGN VIRAMA - 0x0BCD, # TAMIL SIGN VIRAMA - 0x0C4D, # TELUGU SIGN VIRAMA - 0x0CCD, # KANNADA SIGN VIRAMA - 0x0D4D, # MALAYALAM SIGN VIRAMA - 0x0DCA, # SINHALA SIGN AL-LAKUNA - 0x1B44, # BALINESE ADEG ADEG - 0xA806, # SYLOTI NAGRI SIGN HASANTA - 0xA8C4, # SAURASHTRA SIGN VIRAMA - 0xA9C0, # JAVANESE PANGKON - 0x11046, # BRAHMI VIRAMA - 0x110B9, # KAITHI SIGN VIRAMA - 0x111C0, # SHARADA SIGN VIRAMA - 0x11235, # KHOJKI SIGN VIRAMA - 0x1134D, # GRANTHA SIGN VIRAMA - 0x11442, # NEWA SIGN VIRAMA - 0x114C2, # TIRHUTA SIGN VIRAMA - 0x115BF, # SIDDHAM SIGN VIRAMA - 0x1163F, # MODI SIGN VIRAMA - 0x116B6, # TAKRI SIGN VIRAMA - 0x11839, # DOGRA SIGN VIRAMA - 0x119E0, # NANDINAGARI SIGN VIRAMA - 0x11C3F, # BHAIKSUKI SIGN VIRAMA -)) -_ISC_CONSONANT_TABLE = ISC_CONSONANT - -# In 'parse' mode, strings longer than this are checked for cursor-movement -# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to -# 'ignore' to skip character-by-character parsing. The detection scan cost is -# negligible for long strings but wasted on short ones like labels or headings. -_WIDTH_FAST_PATH_MIN_LEN = 20 - -# Translation table to strip C0/C1 control characters for fast 'ignore' mode. -_CONTROL_CHAR_TABLE = str.maketrans('', '', ( - ''.join(chr(c) for c in range(0x00, 0x20)) + # C0: NUL through US (including tab) - '\x7f' + # DEL - ''.join(chr(c) for c in range(0x80, 0xa0)) # C1: U+0080-U+009F -)) - # Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API, # or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly # re-export imports here for no other reason than to satisfy the type checkers (mypy). Yak shavings. 
@@ -180,200 +127,6 @@ ) -# maxsize=1024: western scripts need ~64 unique codepoints per session, but -# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates -# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. - -@lru_cache(maxsize=1024) -def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument - r""" - Given one Unicode codepoint, return its printable length on a terminal. - - :param wc: A single Unicode character. - :param unicode_version: Ignored. Retained for backwards compatibility. - - .. deprecated:: 0.3.0 - Only the latest Unicode version is now shipped. - - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts - where ambiguous characters display as double-width. See - :ref:`ambiguous_width` for details. - :returns: The width, in cells, necessary to display the character of - Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has - no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is - not printable, or has an indeterminate effect on the terminal, such as - a control character. Otherwise, the number of column positions the - character occupies on a graphic terminal (1 or 2) is returned. - - See :ref:`Specification` for details of cell measurement. - """ - ucs = ord(wc) if wc else 0 - - # small optimization: early return of 1 for printable ASCII, this provides - # approximately 40% performance improvement for mostly-ascii documents, with - # less than 1% impact to others. 
- if 32 <= ucs < 0x7f: - return 1 - - # C0/C1 control characters are -1 for compatibility with POSIX-like calls - if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: - return -1 - - # Zero width - if _bisearch(ucs, _ZERO_WIDTH_TABLE): - return 0 - - # Wide (F/W categories) - if _bisearch(ucs, _WIDE_EASTASIAN_TABLE): - return 2 - - # Ambiguous width (A category) - only when ambiguous_width=2 - if ambiguous_width == 2 and _bisearch(ucs, _AMBIGUOUS_TABLE): - return 2 - - return 1 - - -def wcswidth( - pwcs: str, - n: int | None = None, - unicode_version: str = 'auto', - ambiguous_width: int = 1, -) -> int: - """ - Given a unicode string, return its printable length on a terminal. - - :param pwcs: Measure width of given unicode string. - :param n: When ``n`` is None (default), return the length of the entire - string, otherwise only the first ``n`` characters are measured. - - :param unicode_version: Ignored. Retained for backwards compatibility. - - .. deprecated:: 0.3.0 - Only the latest Unicode version is now shipped. - - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: The width, in cells, needed to display the first ``n`` characters - of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control - characters! - - See :ref:`Specification` for details of cell measurement. - """ - # pylint: disable=unused-argument,too-many-locals,too-many-statements - # pylint: disable=too-complex,too-many-branches - # This function intentionally kept long without delegating functions to reduce function calls in - # "hot path", the overhead per-character adds up. 
- - # Fast path: pure ASCII printable strings are always width == length - if n is None and pwcs.isascii() and pwcs.isprintable(): - return len(pwcs) - - # Select wcwidth call pattern for best lru_cache performance: - # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls - # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) - _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) - - end = len(pwcs) if n is None else n - total_width = 0 - idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) - while idx < end: - char = pwcs[idx] - ucs = ord(char) - if ucs == 0x200D: - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < end: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - if ucs == 0xFE0F and last_measured_idx >= 0: - # VS16 following a measured character: add 1 if that character is - # known to be converted from narrow to wide by VS16. 
- total_width += _bisearch(ord(pwcs[last_measured_idx]), - VS16_NARROW_TO_WIDE["9.0.0"]) - last_measured_idx = -2 # Prevent double application - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs only when the last one is - # received, because RI's are received so rarely its better than per-loop tracking of - # 'last char was an RI'. - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd - # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag - # and wide 'U'. - idx += 1 - last_measured_ucs = ucs - continue - # First or unpaired RI: measured normally (width 2 from table) - # Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - # Virama conjunct formation: consonant following virama contributes 0 width. 
- # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - wcw = _wcwidth(char) - if wcw < 0: - # early return -1 on C0 and C1 control characters - return wcw - if wcw > 0: - if conjunct_pending: - total_width += 1 - conjunct_pending = False - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - wcw = 1 - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - total_width += wcw - idx += 1 - if conjunct_pending: - total_width += 1 - return total_width - - -# NOTE: _wcversion_value and _wcmatch_version are no longer used internally -# by wcwidth since version 0.5.0 (only the latest Unicode version is shipped). -# -# They are retained for API compatibility with external tools like ucs-detect -# that may use these private functions. - - @lru_cache(maxsize=128) def _wcversion_value(ver_string: str) -> tuple[int, ...]: # pragma: no cover """ @@ -408,322 +161,6 @@ def _wcmatch_version(given_version: str) -> str: # pylint: disable=unused-argum return _LATEST_VERSION -def iter_sequences(text: str) -> Iterator[tuple[str, bool]]: - r""" - Iterate through text, yielding segments with sequence identification. - - This generator yields tuples of ``(segment, is_sequence)`` for each part - of the input text, where ``is_sequence`` is ``True`` if the segment is - a recognized terminal escape sequence. - - :param text: String to iterate through. - :returns: Iterator of (segment, is_sequence) tuples. - - .. 
versionadded:: 0.3.0 - - Example:: - - >>> list(iter_sequences('hello')) - [('hello', False)] - >>> list(iter_sequences('\x1b[31mred')) - [('\x1b[31m', True), ('red', False)] - >>> list(iter_sequences('\x1b[1m\x1b[31m')) - [('\x1b[1m', True), ('\x1b[31m', True)] - """ - idx = 0 - text_len = len(text) - segment_start = 0 - - while idx < text_len: - char = text[idx] - - if char == '\x1b': - # Yield any accumulated non-sequence text - if idx > segment_start: - yield (text[segment_start:idx], False) - - # Try to match an escape sequence - match = ZERO_WIDTH_PATTERN.match(text, idx) - if match: - yield (match.group(), True) - idx = match.end() - else: - # Lone ESC or unrecognized - yield as sequence anyway - yield (char, True) - idx += 1 - segment_start = idx - else: - idx += 1 - - # Yield any remaining text - if segment_start < text_len: - yield (text[segment_start:], False) - - -def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: - """ - Fast path for width() with control_codes='ignore'. - - Strips escape sequences and control characters, then measures remaining text. - """ - return wcswidth( - strip_sequences(text).translate(_CONTROL_CHAR_TABLE), - ambiguous_width=ambiguous_width - ) - - -def width( - text: str, - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - tabsize: int = 8, - ambiguous_width: int = 1, -) -> int: - r""" - Return printable width of text containing many kinds of control codes and sequences. - - Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal - output sequences. Never returns -1. - - :param text: String to measure. - :param control_codes: How to handle control characters and sequences: - - - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB - ``\t``, cursor left and right movement sequences. Vertical movement (LF, VT, FF) and - indeterminate terminal sequences are zero-width. 
OSC 66 Kitty Text Sizing protocol, OSC 8 - Hyperlink, and many other kinds of output sequences are parsed for displayed measurements. - - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with - indeterminate results of the screen or cursor, like clear or vertical movement. Generally, - these should be handled with a virtual terminal emulator (like 'pyte'). - - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as - width 0. This is the fastest measurement for text already filtered or known not to contain - any kinds of control codes or sequences. TAB ``\t`` is zero-width; to ensure - tab expansion, pre-process text using :func:`str.expandtabs`. - - :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. - Must be positive. Has no effect when ``control_codes='ignore'``. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences - present in ``text`` according to given parameters. This represents the rightmost column the - cursor reaches. Always a non-negative integer. - - :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate - effects, such as vertical movement or clear sequences are encountered, or on unexpected - C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values. - - .. 
versionadded:: 0.3.0 - - Examples:: - - >>> width('hello') - 5 - >>> width('コンニチハ') - 10 - >>> width('\x1b[31mred\x1b[0m') - 3 - >>> width('\x1b[31mred\x1b[0m', control_codes='ignore') # same result (ignored) - 3 - >>> width('123\b4') # backspace overwrites previous cell (outputs '124') - 3 - >>> width('abc\t') # tab caused cursor to move to column 8 - 8 - >>> width('1\x1b[10C') # '1' + cursor right 10, cursor ends on column 11 - 11 - >>> width('1\x1b[10C', control_codes='ignore') # faster but wrong in this case - 1 - """ - # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals - # This could be broken into sub-functions (#1, #3, and #6 especially), but for reduced overhead - # in consideration of this function a likely "hot path", they are inline, breaking many pylint - # complexity rules. - - # Fast path for ASCII printable (no tabs, escapes, or control chars) - if text.isascii() and text.isprintable(): - return len(text) - - # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. - # Only check longer strings - the detection overhead hurts short string performance. - if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: - # Check for cursor-affecting control characters - if '\b' not in text and '\t' not in text and '\r' not in text: - # Check for escape sequences that can't be ignored, if present - if '\x1b' not in text or ( - not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) and - not TEXT_SIZING_PATTERN.search(text) - ): - control_codes = 'ignore' - - # Fast path for ignore mode, useful if you know the text is already free of control codes - if control_codes == 'ignore': - return _width_ignored_codes(text, ambiguous_width) - - strict = control_codes == 'strict' - # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0. - # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width. 
- current_col = 0 - max_extent = 0 - idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16; -2 can never match idx-1 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) - text_len = len(text) - - # Select wcwidth call pattern for best lru_cache performance: - # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls - # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) - _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) - - while idx < text_len: - char = text[idx] - - # 1. Handle ESC sequences - if char == '\x1b': - # 1a. OSC 66 (kitty text sizing) positive width - if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): - text_size = TextSizing.from_match(ts_match, control_codes=control_codes) - current_col += text_size.display_width(ambiguous_width) - max_extent = max(max_extent, current_col) - idx = ts_match.end() - continue - # 1b. Check all other "zero-width" terminal sequences - match = ZERO_WIDTH_PATTERN.match(text, idx) - if match: - seq = match.group() - if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): - raise ValueError(f"Indeterminate cursor sequence at position {idx}") - # Apply cursor movement - right = CURSOR_RIGHT_SEQUENCE.match(seq) - if right: - current_col += int(right.group(1) or 1) - else: - left = CURSOR_LEFT_SEQUENCE.match(seq) - if left: - current_col = max(0, current_col - int(left.group(1) or 1)) - idx = match.end() - else: - idx += 1 - max_extent = max(max_extent, current_col) - continue - - # 2. 
Handle illegal and vertical control characters (zero width, error in strict) - if char in ILLEGAL_CTRL: - if strict: - raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") - idx += 1 - continue - - if char in VERTICAL_CTRL: - if strict: - raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") - idx += 1 - continue - - # 3. Handle horizontal movement characters - if char in HORIZONTAL_CTRL: - if char == '\x09' and tabsize > 0: # Tab - current_col += tabsize - (current_col % tabsize) - elif char == '\x08': # Backspace - if current_col > 0: - current_col -= 1 - elif char == '\x0d': # Carriage return - current_col = 0 - max_extent = max(max_extent, current_col) - idx += 1 - continue - - # 4. Handle ZWJ - if char == '\u200D': - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < text_len: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - - # 5. Handle other zero-width characters (control chars) - if char in ZERO_WIDTH_CTRL: - idx += 1 - continue - - ucs = ord(char) - - # 6. Handle VS16: converts preceding narrow character to wide - if ucs == 0xFE0F: - if last_measured_idx == idx - 1: - if _bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]): - current_col += 1 - max_extent = max(max_extent, current_col) - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - - # 6b. 
Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - last_measured_ucs = ucs - idx += 1 - continue - # 6c. Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - - # 7. Virama conjunct formation: consonant following virama contributes 0 width. - # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - - # 8. Normal characters: measure with wcwidth - w = _wcwidth(char) - if w > 0: - if conjunct_pending: - current_col += 1 - conjunct_pending = False - current_col += w - max_extent = max(max_extent, current_col) - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - current_col += 1 - max_extent = max(max_extent, current_col) - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - idx += 1 - - if conjunct_pending: - current_col += 1 - max_extent = max(max_extent, current_col) - return max_extent - - def ljust( text: str, dest_width: int, @@ -853,38 +290,6 @@ def center( return fillchar * left_pad + text + fillchar * right_pad -def strip_sequences(text: str) -> str: - r""" - Return text with all terminal escape sequences removed. 
- - For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), - the inner text is preserved. - - Unknown or incomplete ESC sequences are preserved. - - :param text: String that may contain terminal escape sequences. - :returns: The input text with all escape sequences stripped. - - .. versionadded:: 0.3.0 - - Example:: - - >>> strip_sequences('\x1b[31mred\x1b[0m') - 'red' - >>> strip_sequences('hello') - 'hello' - >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') - 'bold red text' - >>> strip_sequences('\x1b]66;s=2;hello\x07') - 'hello' - >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') - '[view]' - """ - if '\x1b]66;' in text: - text = TEXT_SIZING_PATTERN.sub(r'\2', text) - return ZERO_WIDTH_PATTERN.sub('', text) - - def clip( text: str, start: int, @@ -963,9 +368,12 @@ def clip( propagate_sgr = False # SGR tracking state (only when propagate_sgr=True) - sgr_at_clip_start = None # state when first visible char emitted (None = not yet) + # sgr_at_clip_start is sgr state when first visible char emitted (None = not yet) + sgr_at_clip_start = None + # current active sgr state + sgr = None # current SGR state, updated by matches of _SGR_PATTERN if propagate_sgr: - sgr = _SGR_STATE_DEFAULT # current SGR state, updated by all sequences + sgr = _SGR_STATE_DEFAULT output: list[str] = [] col = 0 @@ -1006,7 +414,7 @@ def clip( if (match := ZERO_WIDTH_PATTERN.match(text, idx)): seq = match.group() - if propagate_sgr and _SGR_PATTERN.match(seq): + if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): # Update SGR state; will be applied as prefix when visible content starts sgr = _sgr_state_update(sgr, seq) else: From 309deb29c746adc38d02db59d6c5ff8eb45b8256 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Mon, 27 Apr 2026 18:38:43 -0400 Subject: [PATCH 10/70] big bucks, no whammies --- tests/test_benchmarks.py | 5 ++--- wcwidth/_wcswidth.py | 1 - wcwidth/escape_sequences.py | 1 + 3 files changed, 3 
insertions(+), 4 deletions(-) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index e642a17..a27653c 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -3,7 +3,6 @@ import os import sys import unicodedata -import platform # 3rd party import pytest @@ -375,8 +374,8 @@ def test_width_brahmic_bengali(benchmark): ) _py38_skip_pedantic = pytest.mark.skipif( - sys.version_info[:2] < (3, 9), - reason=f'benchmark.pedantic() not supported in python 3.8 or earlier') + sys.version_info[:2] < (3, 9), + reason='benchmark.pedantic() not supported in python 3.8 or earlier') @_udhr_skip diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 82a056a..91d2285 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -1,5 +1,4 @@ """This is a python implementation of wcswidth().""" -# std imports import typing # local diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 9bea12d..ba51d1c 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -7,6 +7,7 @@ """ # std imports import re + import typing # Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. From d4593209580c493dba9ced3b6ccacf7e31dcc579 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Mon, 27 Apr 2026 18:47:51 -0400 Subject: [PATCH 11/70] try .. --- docs/intro.rst | 3 ++- requirements-tests39.in | 1 + requirements-tests39.txt | 3 ++- wcwidth/text_sizing.py | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index e80b021..a4ac498 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -469,7 +469,7 @@ History ======= 0.6.1 *2026-04-26* - * **New** `width()` now supports `Kitty Text Sizing Protocol`_ (OSC 66). + * **New** `width()` now supports `kitty text sizing protocol`_ (OSC 66). 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, @@ -754,6 +754,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/ .. _`Terminal.detect_ambiguous_width()`: https://blessed.readthedocs.io/en/latest/api/terminal.html#blessed.terminal.Terminal.detect_ambiguous_width .. _`parity padding`: https://jazcap53.github.io/pythons-eccentric-strcenter.html +.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/wcwidth.svg?logo=pypi :alt: Downloads :target: https://pypi.org/project/wcwidth/ diff --git a/requirements-tests39.in b/requirements-tests39.in index a8bf293..14cdc5b 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -7,3 +7,4 @@ packaging<26.0 tomli<2.3.0 cffi<2 pytest-benchmark +pygments<2.20 diff --git a/requirements-tests39.txt b/requirements-tests39.txt index 18fc9df..6f1c925 100644 --- a/requirements-tests39.txt +++ b/requirements-tests39.txt @@ -34,8 +34,9 @@ py-cpuinfo==9.0.0 # via pytest-benchmark pycparser==2.23 # via cffi -pygments==2.20.0 +pygments==2.19.2 # via + # -r requirements-tests39.in # pytest # rich pytest==8.4.2 diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 9427a2c..45be03a 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -22,6 +22,7 @@ Numerator, denominator, and alignment codes and values are parsed but otherwise ignored and have no effect on measurements made in this library. + .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ .. 
versionadded:: 0.6.1 From 9f061ea6a8e81c9e59d1ae8145cdff221c37dc66 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Mon, 27 Apr 2026 18:51:21 -0400 Subject: [PATCH 12/70] go ahead, have it all Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- wcwidth/_constants.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/wcwidth/_constants.py b/wcwidth/_constants.py index f414a97..2b4a87b 100644 --- a/wcwidth/_constants.py +++ b/wcwidth/_constants.py @@ -7,6 +7,18 @@ from .table_ambiguous import AMBIGUOUS_EASTASIAN from .unicode_versions import list_versions +__all__ = ( + "_REGIONAL_INDICATOR_SET", + "_ISC_VIRAMA_SET", + "_LATEST_VERSION", + "_CATEGORY_MC_TABLE", + "_EMOJI_ZWJ_SET", + "_FITZPATRICK_RANGE", + "_ZERO_WIDTH_TABLE", + "_WIDE_EASTASIAN_TABLE", + "_AMBIGUOUS_TABLE", +) + _REGIONAL_INDICATOR_SET = frozenset( range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1) ) From 8b41747cde4d558d9bdfef57d7e625686323d7b7 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:16:46 -0400 Subject: [PATCH 13/70] small changes --- docs/specs.rst | 8 +++---- tests/test_core.py | 6 +---- wcwidth/table_grapheme.py | 48 +++++++++++++++++++-------------------- wcwidth/table_mc.py | 8 +++---- wcwidth/table_wide.py | 20 ++++++++-------- wcwidth/table_zero.py | 18 +++++++-------- 6 files changed, 52 insertions(+), 56 deletions(-) diff --git a/docs/specs.rst b/docs/specs.rst index 6a144a4..852f081 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -7,7 +7,7 @@ Specification This document defines how this Python wcwidth library measures the printable width of characters of a string. This is not meant to an official standard, but as a terse description of the lowest level API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth` and its relation to higher level -API function :func:`wcwidth.wcwidth`. 
+functions :func:`wcwidth.width` and :func:`wcwidth.iter_graphemes`. Scope ----- @@ -18,10 +18,10 @@ accepting default arguments, ``control_codes='parse'`` and its behavior and opti its docstring and specifications of related control codes, `XTerm Control Sequences`_ and `Kitty Text Sizing Protocol`_. -:func:`wcwidth.iter_graphemes` is specified by `Unicode Standard Annex #29`_ and each string yielded -by :func:`wcwidth.iter_graphemes` may be mapped to :func:`wcwidth.wcswidth`. Although it matches +Each string yielded by :func:`wcwidth.iter_graphemes` may be mapped to :func:`wcwidth.wcswidth` to +accurately measure the width of a **grapheme**. Although :func:`wcwidth.iter_graphemes` matches behavior of Python 3.15 `uncodedata.iter_graphemes()`_ it differs in its return value, -:func:`wcwidth.iter_graphemes` yields only strings, while :func:`wcwidth.iter_graphemes` yields +:func:`wcwidth.iter_graphemes` yields only strings, while :func:`unicodedata.iter_graphemes` yields ``unicodedata.Segment`` class objects. Width of -1 diff --git a/tests/test_core.py b/tests/test_core.py index ba7b32e..13f6166 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -415,11 +415,7 @@ def test_bengali_nukta_mc(): @pytest.mark.parametrize("repeat", [1, _WIDTH_FAST_PATH_MIN_LEN]) def test_mc_width_consistency(repeat): - """ - Check width() vs. - - wcswidth() consistency - """ + """Check width() vs. wcswidth() consistency.""" # repeat value 'WIDTH_FAST_PATH_MIN_LEN' ensures both "fast" and "slow" paths are taken phrases = [ "\u0915\u094D\u0937\u093F", diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index 3d8c7d3..42fd19e 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,7 +4,7 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. 
+This code generated by wcwidth/bin/update-tables.py on 2026-01-29 23:33:42 UTC. """ # pylint: disable=duplicate-code @@ -202,8 +202,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b44,), # Balinese Vowel Sign Pepe..Balinese Adeg Adeg @@ -284,7 +284,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -367,9 +367,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a99,), # Soyombo Gemination Mark ..Soyombo Subjoiner - (0x11b60, 0x11b60,), # (nil) - (0x11b62, 0x11b64,), # (nil) - (0x11b66, 0x11b66,), # (nil) + (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe + (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short + (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara 
(0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -426,10 +426,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri @@ -617,9 +617,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # (nil) - (0x11b65, 0x11b65,), # (nil) - (0x11b67, 0x11b67,), # (nil) + (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe + (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O + (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya @@ -1892,8 +1892,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b43,), # Balinese Vowel Sign 
Pepe..Balinese Vowel Sign Pepe @@ -1972,7 +1972,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -2055,9 +2055,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a98,), # Soyombo Gemination Mark - (0x11b60, 0x11b60,), # (nil) - (0x11b62, 0x11b64,), # (nil) - (0x11b66, 0x11b66,), # (nil) + (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe + (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short + (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -2114,10 +2114,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri diff --git 
a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 59cce63..7c2e691 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,7 +1,7 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. """ # pylint: disable=duplicate-code CATEGORY_MC = { @@ -181,9 +181,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # (nil) - (0x11b65, 0x11b65,), # (nil) - (0x11b67, 0x11b67,), # (nil) + (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe + (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O + (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 0f0385e..ed6f48a 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. 
""" # pylint: disable=duplicate-code WIDE_EASTASIAN = { @@ -71,10 +71,10 @@ (0x0ff01, 0x0ff60,), # Fullwidth Exclamation Ma..Fullwidth Right White Pa (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma - (0x16ff2, 0x16ff6,), # (nil) + (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) - (0x18d80, 0x18df2,), # (nil) + (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N (0x1affd, 0x1affe,), # Katakana Letter Minnan N..Katakana Letter Minnan N @@ -116,7 +116,7 @@ (0x1f680, 0x1f6c5,), # Rocket ..Left Luggage (0x1f6cc, 0x1f6cc,), # Sleeping Accommodation (0x1f6d0, 0x1f6d2,), # Place Of Worship ..Shopping Trolley - (0x1f6d5, 0x1f6d8,), # Hindu Temple ..(nil) + (0x1f6d5, 0x1f6d8,), # Hindu Temple ..Landslide (0x1f6dc, 0x1f6df,), # Wireless ..Ring Buoy (0x1f6eb, 0x1f6ec,), # Airplane Departure ..Airplane Arriving (0x1f6f4, 0x1f6fc,), # Scooter ..Roller Skate @@ -126,12 +126,12 @@ (0x1f93c, 0x1f945,), # Wrestlers ..Goal Net (0x1f947, 0x1f9ff,), # First Place Medal ..Nazar Amulet (0x1fa70, 0x1fa7c,), # Ballet Shoes ..Crutch - (0x1fa80, 0x1fa8a,), # Yo-yo ..(nil) - (0x1fa8e, 0x1fac6,), # (nil) ..Fingerprint - (0x1fac8, 0x1fac8,), # (nil) - (0x1facd, 0x1fadc,), # (nil) ..Root Vegetable - (0x1fadf, 0x1faea,), # Splatter ..(nil) - (0x1faef, 0x1faf8,), # (nil) ..Rightwards Pushing Hand + (0x1fa80, 0x1fa8a,), # Yo-yo ..Trombone + (0x1fa8e, 0x1fac6,), # Treasure Chest ..Fingerprint + (0x1fac8, 0x1fac8,), # Hairy Creature + (0x1facd, 0x1fadc,), # Orca ..Root Vegetable + (0x1fadf, 0x1faea,), # Splatter ..Distorted Face + (0x1faef, 0x1faf8,), # Fight Cloud ..Rightwards Pushing Hand (0x20000, 0x2fffd,), # Cjk 
Unified Ideograph-20..(nil) (0x30000, 0x3fffd,), # Cjk Unified Ideograph-30..(nil) ), diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index cb4bdba..c440bfc 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,7 +1,7 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-03-29 04:41:09 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. """ # pylint: disable=duplicate-code ZERO_WIDTH = { @@ -147,8 +147,8 @@ (0x01a55, 0x01a5e,), # Tai Tham Consonant Sign ..Tai Tham Consonant Sign (0x01a60, 0x01a7c,), # Tai Tham Sign Sakot ..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b04,), # Balinese Sign Ulu Ricem ..Balinese Sign Bisah (0x01b34, 0x01b44,), # Balinese Sign Rerekan ..Balinese Adeg Adeg (0x01b6b, 0x01b73,), # Balinese Musical Symbol ..Balinese Musical Symbol @@ -222,7 +222,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11000, 0x11002,), # Brahmi Sign Candrabindu ..Brahmi Sign Visarga @@ -284,7 +284,7 @@ (0x11a47, 0x11a47,), # Zanabazar Square Subjoiner (0x11a51, 0x11a5b,), # Soyombo Vowel Sign I ..Soyombo Vowel Length Mar (0x11a8a, 0x11a99,), # Soyombo Final Consonant ..Soyombo Subjoiner 
- (0x11b60, 0x11b67,), # (nil) + (0x11b60, 0x11b67,), # Sharada Vowel Sign Oe ..Sharada Vowel Sign Candr (0x11c2f, 0x11c36,), # Bhaiksuki Vowel Sign Aa ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3f,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Virama (0x11c92, 0x11ca7,), # Marchen Subjoined Letter..Marchen Subjoined Letter @@ -339,10 +339,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0xe0000, 0xe0fff,), # (nil) From af4cab897a4f57312e4cb803bf9652b343a7510f Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:29:07 -0400 Subject: [PATCH 14/70] tempted to dump pypy testing .. 
--- requirements-tests39.in | 1 + requirements-tests39.txt | 6 ++++-- tests/test_core.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/requirements-tests39.in b/requirements-tests39.in index 14cdc5b..a2167f2 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -8,3 +8,4 @@ tomli<2.3.0 cffi<2 pytest-benchmark pygments<2.20 +zipp<3.23.1 diff --git a/requirements-tests39.txt b/requirements-tests39.txt index 6f1c925..1b92757 100644 --- a/requirements-tests39.txt +++ b/requirements-tests39.txt @@ -60,5 +60,7 @@ tomli==2.2.1 # pytest typing-extensions==4.15.0 # via exceptiongroup -zipp==3.23.1 - # via importlib-metadata +zipp==3.23.0 + # via + # -r requirements-tests39.in + # importlib-metadata diff --git a/tests/test_core.py b/tests/test_core.py index 13f6166..cf71d31 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -415,7 +415,7 @@ def test_bengali_nukta_mc(): @pytest.mark.parametrize("repeat", [1, _WIDTH_FAST_PATH_MIN_LEN]) def test_mc_width_consistency(repeat): - """Check width() vs. wcswidth() consistency.""" + """Check width() to wcswidth() consistency.""" # repeat value 'WIDTH_FAST_PATH_MIN_LEN' ensures both "fast" and "slow" paths are taken phrases = [ "\u0915\u094D\u0937\u093F", From 8e7dade82e1244bab7d86a891ac24f9e7f257d47 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:36:51 -0400 Subject: [PATCH 15/70] That's it. Disable pypy. Fuck Microsoft. 
--- .github/workflows/ci.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b51e64f..a6fd35a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,10 +72,16 @@ jobs: - "3.12" - "3.13" - "3.14" - - "pypy-3.8" - - "pypy-3.9" - - "pypy-3.10" - - "pypy-3.11" + # jquast: pypy disabled 4/28/26 when installing requirements-tests39.txt, "ERROR: Could not + # install packages due to an OSError: ('Connection broken: IncompleteRead(14094 bytes + # read, 554 more expected)', Occur *only* with pypy versions 3.10 and 3.11, and *only* on + # the Windows platform. Microsoft's is an international crime ring designed to keep + # mediocre middle-aged men employed doing barely any work at all, tracking and working + # around bullshit just like this, I'm not paid and I won't be played. + # - "pypy-3.8" + # - "pypy-3.9" + # - "pypy-3.10" + # - "pypy-3.11" runs-on: ${{ matrix.os }} container: ${{ matrix.container }} From fe98de8c373b796fa0513d478fa0c7ee65611a4b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:46:21 -0400 Subject: [PATCH 16/70] small refactor and remove local import in textwrap --- tests/test_textwrap.py | 96 +++++++++++++----------------------------- wcwidth/textwrap.py | 4 +- 2 files changed, 30 insertions(+), 70 deletions(-) diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index 094c8e5..f1a1397 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -76,17 +76,14 @@ def _colorize(text): ) -EDGE_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ ('', 10, []), (' ', 10, []), ('\u5973', 0, ['\u5973']), ('\u5973', 1, ['\u5973']), (ZWJ_FAMILY, 1, [ZWJ_FAMILY]), (HANGUL_GA, 1, [HANGUL_GA]), -] - - -@pytest.mark.parametrize('text,w,expected', EDGE_CASES) +]) def test_wrap_edge_cases(text, w, expected): assert wrap(text, w) == expected @@ -95,28 +92,22 @@ def test_wrap_initial_indent(): assert wrap('hello world', 
10, initial_indent='> ') == ['> hello', 'world'] -LONG_WORD_CASES = [ +@pytest.mark.parametrize('text,w,break_long,expected', [ ('abcdefghij', 3, True, ['abc', 'def', 'ghi', 'j']), ('abcdefghij', 3, False, ['abcdefghij']), -] - - -@pytest.mark.parametrize('text,w,break_long,expected', LONG_WORD_CASES) +]) def test_wrap_long_words(text, w, break_long, expected): assert wrap(text, w, break_long_words=break_long) == expected -HYPHEN_LONG_WORD_CASES = [ +@pytest.mark.parametrize('text,w,break_hyphens,propagate,expected', [ ('a-b-c-d', 3, True, True, ['a-', 'b-', 'c-d']), ('a-b-c-d', 3, False, True, ['a-b', '-c-', 'd']), ('---', 2, True, True, ['--', '-']), ('a---b', 2, True, True, ['a-', '--', 'b']), ('a-\x1b[31mb', 2, True, True, ['a-\x1b[31m\x1b[0m', '\x1b[31mb\x1b[0m']), ('a-\x1b[31mb', 2, True, False, ['a-\x1b[31m', 'b']), -] - - -@pytest.mark.parametrize('text,w,break_hyphens,propagate,expected', HYPHEN_LONG_WORD_CASES) +]) def test_wrap_hyphen_long_words(text, w, break_hyphens, propagate, expected): assert wrap(text, w, break_on_hyphens=break_hyphens, propagate_sgr=propagate) == expected @@ -182,7 +173,7 @@ def test_wrap_multiline_matches_stdlib(): assert wrap(given, 30) == textwrap.wrap(given, 30) -UNICODE_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ # CJK (2 cells each) ('\u4e2d\u6587\u5b57\u7b26', 4, ['\u4e2d\u6587', '\u5b57\u7b26']), ('\u4e2d\u6587\u5b57', 5, ['\u4e2d\u6587', '\u5b57']), @@ -192,18 +183,14 @@ def test_wrap_multiline_matches_stdlib(): (f'{FAMILY_ZWJ} ab', 4, [FAMILY_ZWJ, 'ab']), (f'{SMILEY_VS16} ab', 3, [SMILEY_VS16, 'ab']), ('\U0001F469\U0001F467\U0001F466', 4, ['\U0001F469\U0001F467', '\U0001F466']), -] - - -@pytest.mark.parametrize('text,w,expected', UNICODE_CASES) +]) def test_wrap_unicode(benchmark, text, w, expected): kwargs = {'break_on_hyphens': False} if '-' in text else {} result = benchmark(wrap, text, w, **kwargs) assert result == expected -# Escape sequence preservation (with propagate_sgr=True default) -SEQUENCE_CASES 
= [ +@pytest.mark.parametrize('text,w,expected', [ # SGR sequences propagated across lines (f'{SGR_RED}red{SGR_RESET} blue', 4, [f'{SGR_RED}red{SGR_RESET}', 'blue']), # SGR at end of line propagates to next line @@ -223,41 +210,34 @@ def test_wrap_unicode(benchmark, text, w, expected): ['x\x1b[31mab\x1b[0m', '\x1b[31mcde\x1b[0m', '\x1b[31mfgh\x1b[0m', '\x1b[31mij\x1b[0m']), # Lone ESC - not a valid SGR sequence, stays with preceding text ('abc\x1bdefghij', 3, ['abc\x1b', 'def', 'ghi', 'j']), -] - -SEQUENCE_CASES_NO_PROPAGATE = [ - (f'hello{SGR_RED} world', 6, [f'hello{SGR_RED}', 'world']), - ('x\x1b[31mabcdefghij\x1b[0m', 3, ['x\x1b[31mab', 'cde', 'fgh', 'ij\x1b[0m']), -] - - -@pytest.mark.parametrize('text,w,expected', SEQUENCE_CASES) +]) def test_wrap_sequences(benchmark, text, w, expected): + """Escape sequence preservation (with propagate_sgr=True default)""" assert benchmark(wrap, text, w) == expected -@pytest.mark.parametrize('text,w,expected', SEQUENCE_CASES_NO_PROPAGATE) +@pytest.mark.parametrize('text,w,expected', [ + (f'hello{SGR_RED} world', 6, [f'hello{SGR_RED}', 'world']), + ('x\x1b[31mabcdefghij\x1b[0m', 3, ['x\x1b[31mab', 'cde', 'fgh', 'ij\x1b[0m']), +] +) def test_wrap_sequences_no_propagate(text, w, expected): result = wrap(text, w, propagate_sgr=False) assert result == expected -# Mixed: sequences + unicode -MIXED_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ (f'{SGR_RED}\u4e2d\u6587{SGR_RESET} ab', 5, [f'{SGR_RED}\u4e2d\u6587{SGR_RESET}', 'ab']), (f'{SGR_RED}{FAMILY_ZWJ}{SGR_RESET} ab', 4, [f'{SGR_RED}{FAMILY_ZWJ}{SGR_RESET}', 'ab']), (f'{SGR_BOLD}\u4e2d{SGR_RESET}y z', 4, [f'{SGR_BOLD}\u4e2d{SGR_RESET}y', 'z']), -] - - -@pytest.mark.parametrize('text,w,expected', MIXED_CASES) +]) def test_wrap_mixed(benchmark, text, w, expected): + """Test mixed sequences + unicode.""" result = benchmark(wrap, text, w) assert result == expected -# Tabsize with wide characters - tests column alignment with different cell widths -TABSIZE_WIDE_CASES = [ 
+@pytest.mark.parametrize('text,w,tabsize,expected', [ # CJK (2 cells) + tab: tabsize=4, '\u4e2d' is 2 cols, tab expands to col 4 ('\u4e2d\ta b', 6, 4, ['\u4e2d a', 'b']), # CJK + tab with tabsize=8: '\u4e2d' is 2 cols, tab expands to col 8 @@ -268,10 +248,7 @@ def test_wrap_mixed(benchmark, text, w, expected): ('\u4e2d\u6587\ta', 8, 4, ['\u4e2d\u6587 a']), # ASCII + tab + CJK: 'a' is 1 col, tab to 4 (3 spaces), CJK is 2 cols ('a\t\u4e2d b', 8, 4, ['a \u4e2d b']), -] - - -@pytest.mark.parametrize('text,w,tabsize,expected', TABSIZE_WIDE_CASES) +]) @pytest.mark.skipif( platform.python_implementation() == 'PyPy' and sys.version_info < (3, 9), reason='PyPy 3.8 str.expandtabs() counts UTF-8 bytes instead of characters' @@ -286,7 +263,7 @@ def test_wrap_tabsize_wide_chars(text, w, tabsize, expected): OSC_START_BEL = '\x1b]8;;http://example.com\x07' OSC_END_BEL = '\x1b]8;;\x07' -HYPERLINK_WORD_BOUNDARY_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ ( # standard, ST-variant, f'{OSC_START_ST}link{OSC_END_ST}more', 5, @@ -408,18 +385,14 @@ def test_wrap_tabsize_wide_chars(text, w, tabsize, expected): '\x1b]8;foo=bar:id=mylink;http://example.com\x1b\\Click\x1b]8;;\x1b\\', '\x1b]8;foo=bar:id=mylink;http://example.com\x1b\\here\x1b]8;;\x1b\\', ], - ), -] - - -@pytest.mark.parametrize('text,w,expected', HYPERLINK_WORD_BOUNDARY_CASES) + ),]) def test_wrap_hyperlink_word_boundary(text, w, expected): """OSC hyperlink sequences should act as word boundaries.""" result = wrap(text, w) assert result == expected -PLACEHOLDER_STDLIB_CASES = [ +@pytest.mark.parametrize('text,kwargs', [ ('The quick brown fox jumps over the lazy dog', {'width': 10, 'max_lines': 3, 'placeholder': '...'}), ('1234567890 1234567890 extra', @@ -444,10 +417,7 @@ def test_wrap_hyperlink_word_boundary(text, w, expected): {'width': 10, 'subsequent_indent': ' ', 'max_lines': 2, 'placeholder': '...'}), ('hello world foo bar', {'width': 10, 'initial_indent': '> ', 'max_lines': 2, 'placeholder': '...'}), -] - 
- -@pytest.mark.parametrize('text,kwargs', PLACEHOLDER_STDLIB_CASES) + ]) def test_wrap_max_lines_matches_stdlib(text, kwargs): expected = _adjust_stdlib_result(textwrap.wrap(text, **kwargs), kwargs) assert wrap(text, **kwargs) == expected @@ -460,7 +430,7 @@ def test_wrap_placeholder_too_large(): textwrap.wrap('fox', width=1, max_lines=3, placeholder='...') -MAX_LINES_SEQUENCE_CASES = [ +@pytest.mark.parametrize('text,w,ml,ph,expected', [ (f'{SGR_RED}hello world foo bar{SGR_RESET}', 8, 2, '...', [f'{SGR_RED}hello{SGR_RESET}', f'{SGR_RED}world...{SGR_RESET}']), (f'{SGR_RED}hello{SGR_RESET} world foo', @@ -470,10 +440,7 @@ def test_wrap_placeholder_too_large(): ('\u4e2d\u6587 \u5b57\u7b26 hello', 5, 1, '~', ['\u4e2d\u6587~']), ('\u4e2d\u6587 \u5b57\u7b26 hello world', 5, 2, '~', ['\u4e2d\u6587', '\u5b57\u7b26~']), ('\u4e2d\u6587\u5b57\u7b26 hello', 12, 1, '...', ['\u4e2d\u6587\u5b57\u7b26...']), -] - - -@pytest.mark.parametrize('text,w,ml,ph,expected', MAX_LINES_SEQUENCE_CASES) + ]) def test_wrap_max_lines_sequences(text, w, ml, ph, expected): assert wrap(text, w, max_lines=ml, placeholder=ph) == expected @@ -494,19 +461,14 @@ def test_wrap_max_lines_hyperlink_close_on_prev_line(): assert result == [f'{OSC_START_ST}ab{OSC_END_ST}...'] -# -- expand_tabs, replace_whitespace, fix_sentence_endings -- - -STDLIB_PARAM_CASES = [ +@pytest.mark.parametrize('text,kwargs', [ ('hello\tworld', {'width': 20, 'expand_tabs': False, 'replace_whitespace': False}), ('hello\tworld foo\tbar baz', {'width': 12, 'expand_tabs': False, 'tabsize': 8}), ('hello\nworld', {'width': 20, 'replace_whitespace': False}), ('a\t b\n c', {'width': 20, 'replace_whitespace': False}), ('Hello world. This is a test. More text.', {'width': 20, 'fix_sentence_endings': True}), ('Dr. Smith went to Washington. 
He left.', {'width': 20, 'fix_sentence_endings': True}), -] - - -@pytest.mark.parametrize('text,kwargs', STDLIB_PARAM_CASES) + ]) def test_wrap_stdlib_params(text, kwargs): assert wrap(text, **kwargs) == textwrap.wrap(text, **kwargs) diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index b471723..5cf069b 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -17,6 +17,7 @@ from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences +from ._width import width if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal @@ -97,9 +98,6 @@ def _next_hyperlink_id() -> str: def _width(self, text: str) -> int: """Measure text width accounting for sequences.""" - # pylint: disable=import-outside-toplevel - # local - from .wcwidth import width return width(text, control_codes=self.control_codes, tabsize=self.tabsize, ambiguous_width=self.ambiguous_width) From 20f1bf6af9b59516fa309be8e53f51b2d1c5252b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:47:16 -0400 Subject: [PATCH 17/70] re-order text sizing after SGR (less common/faster match) --- wcwidth/wcwidth.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 48383d0..611c387 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -388,6 +388,17 @@ def clip( # Handle escape sequences if char == '\x1b': + if (match := ZERO_WIDTH_PATTERN.match(text, idx)): + seq = match.group() + if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): + # Update SGR state; will be applied as prefix when visible content starts + sgr = _sgr_state_update(sgr, seq) + else: + # Non-SGR sequences always preserved + output.append(seq) + idx = match.end() + continue + # OSC 66 (text sizing) has positive width, handle before zero-width path if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): text_size = 
TextSizing.from_match(ts_match, control_codes='parse') @@ -412,16 +423,6 @@ def clip( idx = ts_match.end() continue - if (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() - if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - else: - # Non-SGR sequences always preserved - output.append(seq) - idx = match.end() - continue # Handle bare ESC (not a valid sequence) if char == '\x1b': From 1bc7a4364bcf4f2004efad04ce52ae5d3909fe60 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 08:51:11 -0400 Subject: [PATCH 18/70] don't redefine width from outer scope --- tests/test_textwrap.py | 7 ++++--- wcwidth/textwrap.py | 6 +++--- wcwidth/wcwidth.py | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index f1a1397..0144956 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -263,6 +263,7 @@ def test_wrap_tabsize_wide_chars(text, w, tabsize, expected): OSC_START_BEL = '\x1b]8;;http://example.com\x07' OSC_END_BEL = '\x1b]8;;\x07' + @pytest.mark.parametrize('text,w,expected', [ ( # standard, ST-variant, f'{OSC_START_ST}link{OSC_END_ST}more', @@ -417,7 +418,7 @@ def test_wrap_hyperlink_word_boundary(text, w, expected): {'width': 10, 'subsequent_indent': ' ', 'max_lines': 2, 'placeholder': '...'}), ('hello world foo bar', {'width': 10, 'initial_indent': '> ', 'max_lines': 2, 'placeholder': '...'}), - ]) +]) def test_wrap_max_lines_matches_stdlib(text, kwargs): expected = _adjust_stdlib_result(textwrap.wrap(text, **kwargs), kwargs) assert wrap(text, **kwargs) == expected @@ -440,7 +441,7 @@ def test_wrap_placeholder_too_large(): ('\u4e2d\u6587 \u5b57\u7b26 hello', 5, 1, '~', ['\u4e2d\u6587~']), ('\u4e2d\u6587 \u5b57\u7b26 hello world', 5, 2, '~', ['\u4e2d\u6587', '\u5b57\u7b26~']), ('\u4e2d\u6587\u5b57\u7b26 hello', 12, 1, '...', 
['\u4e2d\u6587\u5b57\u7b26...']), - ]) +]) def test_wrap_max_lines_sequences(text, w, ml, ph, expected): assert wrap(text, w, max_lines=ml, placeholder=ph) == expected @@ -468,7 +469,7 @@ def test_wrap_max_lines_hyperlink_close_on_prev_line(): ('a\t b\n c', {'width': 20, 'replace_whitespace': False}), ('Hello world. This is a test. More text.', {'width': 20, 'fix_sentence_endings': True}), ('Dr. Smith went to Washington. He left.', {'width': 20, 'fix_sentence_endings': True}), - ]) +]) def test_wrap_stdlib_params(text, kwargs): assert wrap(text, **kwargs) == textwrap.wrap(text, **kwargs) diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 5cf069b..cc3a3d7 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -14,10 +14,10 @@ from typing import TYPE_CHECKING, NamedTuple # local +from ._width import width as wcwidth_width from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences -from ._width import width if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal @@ -98,8 +98,8 @@ def _next_hyperlink_id() -> str: def _width(self, text: str) -> int: """Measure text width accounting for sequences.""" - return width(text, control_codes=self.control_codes, tabsize=self.tabsize, - ambiguous_width=self.ambiguous_width) + return wcwidth_width(text, control_codes=self.control_codes, tabsize=self.tabsize, + ambiguous_width=self.ambiguous_width) def _strip_sequences(self, text: str) -> str: """Strip all terminal sequences from text.""" diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 611c387..ed07397 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -423,7 +423,6 @@ def clip( idx = ts_match.end() continue - # Handle bare ESC (not a valid sequence) if char == '\x1b': output.append(char) From ba6741e3a71d42197e9dc75225d3c251e16ef22b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 14:23:38 -0400 Subject: [PATCH 19/70] 
checkpoint --- tests/test_text_sizing.py | 324 ++++++++++++++++++------------------ wcwidth/_width.py | 30 ++-- wcwidth/escape_sequences.py | 3 +- wcwidth/text_sizing.py | 30 ++-- wcwidth/wcwidth.py | 120 ++++++++----- 5 files changed, 273 insertions(+), 234 deletions(-) diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index eb5cbda..a9fc905 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -4,27 +4,32 @@ # local import wcwidth -from wcwidth.text_sizing import TextSizing, TextSizingParams +from wcwidth import TextSizing, TextSizingParams +from wcwidth.text_sizing import TEXT_FIELD_MAPPING from wcwidth.escape_sequences import TEXT_SIZING_PATTERN +_W_HI = TEXT_FIELD_MAPPING['w'].high +_S_HI = TEXT_FIELD_MAPPING['s'].high +_N_HI = TEXT_FIELD_MAPPING['n'].high +_D_HI = TEXT_FIELD_MAPPING['d'].high + CONTROL_CODES_PARAMS_CASES = [ ('x=2', "", "Unknown text sizing field 'x' in "), ('s=3:x=3', "s=3", "Unknown text sizing field 'x' in "), - ('s=2:x=3:w=9', "s=2:w=7", "Unknown text sizing field 'x' in "), + ('s=2:x=3:w=9', f"s=2:w={_W_HI}", "Unknown text sizing field 'x' in "), ('xyz=2', "", "Unknown text sizing field 'xyz' in "), ('xxx', "", "Expected '=' in text sizing parameter"), ('s=xxx', "", "Illegal text sizing value 'xxx' in "), ('s=-99', "", "Out of bounds text sizing value '-99' in "), - ('s=99', "s=7", "Out of bounds text sizing value '99' in "), + ('s=99', f"s={_W_HI}", "Out of bounds text sizing value '99' in "), ('w=-1', "", "Out of bounds text sizing value '-1' in "), - ('w=8', "w=7", "Out of bounds text sizing value '8' in "), - ('n=20', "n=15", "Out of bounds text sizing value '20' in "), - ('d=99', "d=15", "Out of bounds text sizing value '99' in "), + ('w=8', f"w={_W_HI}", "Out of bounds text sizing value '8' in "), + ('n=20', f"n={_N_HI}", "Out of bounds text sizing value '20' in "), + ('d=99', f"d={_D_HI}", "Out of bounds text sizing value '99' in "), ('v=5', "v=2", "Out of bounds text sizing value '5' in "), 
('h=3', "h=2", "Out of bounds text sizing value '3' in "), ] - @pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" @@ -51,29 +56,47 @@ def test_text_sizing_width_control_codes(given_params, expected_remainder, expec @pytest.mark.parametrize('params,text,expected_width', [ # cases of static width=N values, + (TextSizingParams(scale=2, width=1), 'climclam', 2), (TextSizingParams(scale=2, width=3), 'anything', 6), (TextSizingParams(scale=1, width=5), '', 5), (TextSizingParams(scale=3, width=1), 'x', 3), # and automatic width (width=0) values, - (TextSizingParams(scale=1), 'AB', 2), + (TextSizingParams(), '', 0), + (TextSizingParams(), 'AB', 2), + (TextSizingParams(), '中', 2), (TextSizingParams(scale=2), 'AB', 4), - (TextSizingParams(scale=1), '中', 2), (TextSizingParams(scale=2), '中', 4), - (TextSizingParams(scale=1), '', 0), (TextSizingParams(scale=3), '', 0), + (TextSizingParams(scale=7, width=7, numerator=15, denominator=15, + vertical_align=2, horizontal_align=2), 'x!yzzy', 49), ]) def test_text_sizing_width(params, text, expected_width): """Verify width using with both kinds of terminator.""" + # verify internal TextSizing.display_width() result, assert TextSizing(params, text, terminator='\x07').display_width() == expected_width assert TextSizing(params, text, terminator='\x1b\\').display_width() == expected_width seq1 = TextSizing(params, text, terminator='\x07').make_sequence() seq2 = TextSizing(params, text, terminator='\x1b\\').make_sequence() + + # verify round-trip + ts_match1, ts_match2 = TEXT_SIZING_PATTERN.match(seq1), TEXT_SIZING_PATTERN.match(seq2) + assert ts_match1 and ts_match2 + assert TextSizing.from_match(ts_match1) == TextSizing(params, text, terminator='\x07') + assert TextSizing.from_match(ts_match2) == 
TextSizing(params, text, terminator='\x1b\\') + + # and external width(), assert wcwidth.width(seq1) == expected_width assert wcwidth.width(seq2) == expected_width + # verify 'strict' does not raise ValueError + wcwidth.width(seq1, control_codes='strict') + wcwidth.width(seq2, control_codes='strict') + + # and verify 'ignore' measures only inner_text (does not parse scale or width) + assert wcwidth.width(seq1, control_codes='ignore') == wcwidth.wcswidth(text) + assert wcwidth.width(seq2, control_codes='ignore') == wcwidth.wcswidth(text) + -# ('abc\x1b]66;w=3;x\x07def', 'x', 'w=3', 7), -# ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), @pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ ('\x1b]66;s=2:w=2;AB\x07', 'AB', 's=2:w=2', 4), ('\x1b]66;s=2:w=2;\u4e2d\x07', '\u4e2d', 's=2:w=2', 4), @@ -89,7 +112,8 @@ def test_text_sizing_width(params, text, expected_width): ('\x1b]66;w=2;A\x07', 'A', 'w=2', 2), ('\x1b]66;s=2:w=3;text\x1b\\', 'text', 's=2:w=3', 6), ]) -def test_text_sizing_scale_width(given_sequence, expected_text, expected_params, expected_width): +def test_text_sizing_sequence(given_sequence, expected_text, expected_params, expected_width): + """Verify parsing and measured width of raw OSC 66 sequence.""" ts_match = TEXT_SIZING_PATTERN.match(given_sequence) assert ts_match is not None text_size = TextSizing.from_match(ts_match) @@ -99,19 +123,8 @@ def test_text_sizing_scale_width(given_sequence, expected_text, expected_params, assert wcwidth.width(given_sequence, control_codes='strict') == expected_width assert wcwidth.width(given_sequence, control_codes='ignore') == wcwidth.wcswidth(expected_text) - -WIDTH_PARSE_IGNORED_CASES = [ - # when control_codes='ignore', only the 'inner text' width is naturally - # measured, its -] - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_IGNORED_CASES) -def test_width_text_sizing_ignored(text, expected): - assert wcwidth.width(text, control_codes='ignore') == expected - - 
-WIDTH_PARSE_CASES = [ +@pytest.mark.parametrize('text,expected', [ + ('\x1b]66;s=2:w=3:n=1:d=2:v=1:h=2;x!yzzy\x1b\\', 6), ('\x1b]66;s=2:w=3;anything\x07', 6), ('\x1b]66;w=3;x\x07', 3), ('\x1b]66;s=1:w=0;AB\x07', 2), @@ -122,20 +135,14 @@ def test_width_text_sizing_ignored(text, expected): ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), ('\x1b]66;s=2:w=3;text\x1b\\', 6), ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), -] - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_text_sizing_parse(text, expected): +]) +def test_strings_with_text_sizing(text, expected): + """Verify measured width strings containing OSC66.""" assert wcwidth.width(text) == expected - - -@pytest.mark.parametrize('text,expected', WIDTH_PARSE_CASES) -def test_width_text_sizing_strict(text, expected): assert wcwidth.width(text, control_codes='strict') == expected -STRIP_TEXT_SIZING_CASES = [ +@pytest.mark.parametrize('text,expected', [ ('\x1b]66;s=2;hello\x07', 'hello'), ('\x1b]66;s=2;hello\x1b\\', 'hello'), ('\x1b]66;;text\x07', 'text'), @@ -143,53 +150,128 @@ def test_width_text_sizing_strict(text, expected): ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), -] - - -@pytest.mark.parametrize('text,expected', STRIP_TEXT_SIZING_CASES) -def test_strip_sequences_text_sizing(text, expected): +]) +def test_strip_strings_with_text_sizing(text, expected): assert wcwidth.strip_sequences(text) == expected -def test_iter_sequences_text_sizing(): - text = 'abc\x1b]66;s=2;hello\x07def' - segments = list(wcwidth.iter_sequences(text)) - assert segments == [ - ('abc', False), - ('\x1b]66;s=2;hello\x07', True), - ('def', False), - ] - - -def test_iter_sequences_text_sizing_st(): - text = '\x1b]66;w=2;AB\x1b\\' - segments = list(wcwidth.iter_sequences(text)) - assert segments == [('\x1b]66;w=2;AB\x1b\\', True)] +@pytest.mark.parametrize('text,expected_segs', [ + ('abc\x1b]66;s=2;hello\x07def', [('abc', False), 
('\x1b]66;s=2;hello\x07', True), ('def', False)]), + ('abc\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\def', [('abc', False), ('\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\', True), ('def', False)]), +]) +def test_iter_sequences_text_sizing(text, expected_segs): + assert list(wcwidth.iter_sequences(text)) == expected_segs -# ___REPLACE_PADDING_CASES = [ -# ('\x1b]66;w=3;x\x07', ' '), -# ('\x1b]66;s=2:w=2;AB\x07', ' '), -# ('abc\x1b]66;w=1;x\x07def', 'abc def'), -# ('no text sizing here', 'no text sizing here'), -# ] -# -# -# -# -# -# CONTROL_CODES_WIDTH_CASES = [ -# ('hi', dict(scale=2, width=1), '\x07', -# '\x1b]66;s=2:w=1;hi\x07'), -# ('AB', dict(scale=2, width=2), '\x1b\\', -# '\x1b]66;s=2:w=2;AB\x1b\\'), -# ('x', {}, '\x07', -# '\x1b]66;;x\x07'), -# ('', dict(scale=3, width=2), '\x07', -# '\x1b]66;s=3:w=2;\x07'), -# ] -# MAKE_SEQUENCE_CASES = [ - +@pytest.mark.parametrize('text,start,end,expected', [ + ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), + ('\x1b]66;w=3;ABC\x07', 0, 2, '\x1b]66;w=2;AB\x07'), + ('\x1b]66;w=3;ABC\x07', 1, 3, '\x1b]66;w=2;BC\x07'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab\x1b]66;w=1;X\x07'), + ('ab\x1b]66;w=2;XY\x07cd', 3, 6, '\x1b]66;w=1;Y\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), + ]) +def test_clip_text_sizing_basic(text, start, end, expected): + """Test basic support of clip() with text sizing sequence.""" + assert repr(wcwidth.clip(text, start, end)) == repr(expected) + +@pytest.mark.parametrize('text,start,end,expected', [ + ('\x1b]66;s=2;ABC\x07', 0, 0, ''), + ('\x1b]66;s=2;ABC\x07', 6, 6, ''), + ('\x1b]66;s=2;ABC\x07', 0, 2, '\x1b]66;s=2;A\x07'), + ('\x1b]66;s=2;ABC\x07', 0, 4, '\x1b]66;s=2;AB\x07'), + ('\x1b]66;s=2;ABC\x07', 0, 6, '\x1b]66;s=2;ABC\x07'), + ('\x1b]66;s=2;ABC\x07', 2, 6, '\x1b]66;s=2;BC\x07'), + ('\x1b]66;s=2;ABC\x07', 4, 6, '\x1b]66;s=2;C\x07'), + ]) +def test_clip_text_sizing_scaled(text, start, end, expected): + """Test support of clip() with scale=N 
arguments.""" + assert repr(wcwidth.clip(text, start, end)) == repr(expected) + +@pytest.mark.parametrize('text,start,end,expected', [ + # a b c + # === === === + # 012 345 678 + # . + # .. + # *a* + # *a* . + # ... *b* + # ... *b* . + # ... *b* .. + # ... *b* *c* + ('\x1b]66;s=3;ABC\x07', 0, 0, ''), + ('\x1b]66;s=3;ABC\x07', 0, 1, '.'), + ('\x1b]66;s=3;ABC\x07', 0, 2, '..'), + ('\x1b]66;s=3;ABC\x07', 0, 3, '\x1b]66;s=3;A\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 4, '\x1b]66;s=3;A\x07.'), + ('\x1b]66;s=3;ABC\x07', 0, 5, '\x1b]66;s=3;A\x07..'), + ('\x1b]66;s=3;ABC\x07', 0, 6, '\x1b]66;s=3;AB\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 7, '\x1b]66;s=3;AB\x07.'), + ('\x1b]66;s=3;ABC\x07', 0, 8, '\x1b]66;s=3;AB\x07..'), + ('\x1b]66;s=3;ABC\x07', 0, 9, '\x1b]66;s=3;ABC\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 10, '\x1b]66;s=3;ABC\x07'), + # a b + # === === === + # 012 345 678 + # . 1, 2 + # .. 1, 3 + # .. . 1, 4 + # .. .. 1, 5 + # .. *b* 1, 6 + # .. *b* . 1, 7 + # .. *b* .. 1, 8 + # .. *b* *c* 1, 9 + ('\x1b]66;s=3;ABC\x07', 1, 1, ''), + ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), + ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), + ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), + ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), + ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;BC\x07..'), + ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 10, '..\x1b]66;s=3;BC\x07'), + # two-thirds of string 'A' and half of string 'B' is fillchar + # ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), + # half of string 'A' and all of string 'B' + # a b + # === === === + # 012 345 678 + # . 2, 3 + # . . 2, 4 + # . .. 2, 5 + # . *b* 2, 6 + # . *b* . 2, 7 + # . *b* .. 2, 8 + # . 
*b* *c* 2, 9 + ('\x1b]66;s=3;ABC\x07', 2, 2, ''), + ('\x1b]66;s=3;ABC\x07', 2, 3, '.'), + ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), + ('\x1b]66;s=3;ABC\x07', 2, 5, '...'), + ('\x1b]66;s=3;ABC\x07', 2, 6, '.\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 2, 7, '.\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 2, 8, '.\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 2, 9, '.\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 2, 10, '.\x1b]66;s=3;BC\x07'), + # and now 3:10, should be easy ... + ('\x1b]66;s=3;ABC\x07', 3, 3, ''), + ('\x1b]66;s=3;ABC\x07', 3, 4, '.'), + ('\x1b]66;s=3;ABC\x07', 3, 5, '..'), + ('\x1b]66;s=3;ABC\x07', 3, 6, '\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 3, 7, '\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 3, 8, '\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 3, 9, '\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 3, 10, '\x1b]66;s=3;BC\x07'), + ]) +def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): + """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" + assert repr(wcwidth.clip(text, start, end, fillchar='.')) == repr(expected) + + + +# TODO wrap() cases, # # WRAP_CASES = [ # (TextSizingParams(scale=2, width=2), @@ -203,93 +285,7 @@ def test_iter_sequences_text_sizing_st(): # '\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;ABC\x1b\\'), # ] # -# @pytest.mark.parametrize('params,expected', WRAP_CASES) -# def test_wrap(params, expected): -# text = 'ABC' -# terminator = '\x1b\\' -# assert TextSizing(params, text, terminator).make_sequence() == expected -# -# def test_scale_st_terminator(): -# text, scale = 'AB', 2 -# inner_w = wcwidth.wcswidth(text) -# result = _build_seq(text, -# TextSizingParams(scale=scale, width=max(0, inner_w)), -# terminator='\x1b\\') -# assert result == '\x1b]66;s=2:w=2;AB\x1b\\' -# -# -# @pytest.mark.parametrize('text,kwargs,term,expected', MAKE_SEQUENCE_CASES) -# def test_make_sequence(text, kwargs, term, expected): -# assert TextSizing(text, terminator=term, **kwargs) == expected 
-# -# -# @pytest.mark.parametrize('raw,expected', PARSE_PARAMS_EDGE_CASES) -# def test_parse_text_sizing_params_edge(raw, expected): -# assert _parse_text_sizing_params(raw) == expected -# -# -# PARAMS_ROUNDTRIP_CASES = [ -# TextSizingParams(), -# TextSizingParams(scale=3), -# TextSizingParams(scale=2, width=5), -# TextSizingParams(scale=7, width=7, numerator=15, denominator=15, -# vertical_align=2, horizontal_align=2), -# TextSizingParams(numerator=1, denominator=2), -# ] -# -# @pytest.mark.parametrize('params', PARAMS_ROUNDTRIP_CASES) -# def test_params_roundtrip(params): -# text_size = TextSizing(params, "abc", terminator="\x07") -# #assert _parse_text_sizing_params(_make_params_str(params)) == params - -# PARSE_PARAMS_CASES = [ -# ('', TextSizingParams()), -# ('s=2', TextSizingParams(scale=2)), -# ('w=3', TextSizingParams(width=3)), -# ('s=2:w=3', TextSizingParams(scale=2, width=3)), -# ('s=2:w=3:n=1:d=2:v=1:h=2', -# TextSizingParams(scale=2, width=3, numerator=1, denominator=2, -# vertical_align=1, horizontal_align=2)), -# ('n=5:d=10', TextSizingParams(numerator=5, denominator=10)), -# ('v=0:h=0', TextSizingParams()), -# ('s=1:w=0', TextSizingParams()), -# ] - -# PARSE_SEQUENCE_CASES = [ -# ('\x1b]66;s=2;hello\x07', -# (TextSizingParams(scale=2), 'hello', '\x07')), -# ('\x1b]66;s=99;hello\x07', -# (TextSizingParams(scale=TextSizingParams.FIELD_MAPPING['s'].high), 'hello', '\x07')), -# ('\x1b]66;s=-99;hello\x07', -# (TextSizingParams(scale=TextSizingParams.FIELD_MAPPING['s'].low), 'hello', '\x07')), -# ('\x1b]66;s=2;hello\x1b\\', -# (TextSizingParams(scale=2), 'hello', '\x1b\\')), -# ('\x1b]66;;text\x07', -# (TextSizingParams(), 'text', '\x07')), -# ('\x1b]66;s=3:w=2;\x07', -# (TextSizingParams(scale=3, width=2), '', '\x07')), -# ('\x1b]66;w=5;AB\x07', -# (TextSizingParams(width=5), 'AB', '\x07')), -# ('\x1b]66;s=7;' + ('X' * 30) + '\x07', -# (TextSizingParams(scale=7), 'X' * 30, '\x07')), -# ] - # # @pytest.mark.parametrize('seq,expected', 
PARSE_SEQUENCE_CASES) # def test_parse_text_sizing(seq, expected): # assert parse_text_sizing(seq) == expected - - -CLIP_TEXT_SIZING_CASES = [ - ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), - ('\x1b]66;w=3;ABC\x07', 0, 2, ' '), - ('\x1b]66;w=3;ABC\x07', 1, 3, ' '), - ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab '), - ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), -] - - -@pytest.mark.parametrize('text,start,end,expected', CLIP_TEXT_SIZING_CASES) -def test_clip_text_sizing(text, start, end, expected): - assert wcwidth.clip(text, start, end) == expected diff --git a/wcwidth/_width.py b/wcwidth/_width.py index f04ce68..d01a7a4 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -157,29 +157,25 @@ def width( # 1. Handle ESC sequences if char == '\x1b': - # 1a. OSC 66 (kitty text sizing) positive width - if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): - text_size = TextSizing.from_match(ts_match, control_codes=control_codes) - current_col += text_size.display_width(ambiguous_width) - max_extent = max(max_extent, current_col) - idx = ts_match.end() - continue - # 1b. 
Check all other "zero-width" terminal sequences - match = ZERO_WIDTH_PATTERN.match(text, idx) - if match: + # Check for all terminal sequences + if (match := ZERO_WIDTH_PATTERN.match(text, idx)): seq = match.group() if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError(f"Indeterminate cursor sequence at position {idx}") - # Apply cursor movement - right = CURSOR_RIGHT_SEQUENCE.match(seq) - if right: + + # Apply cursor movement, + if (right := CURSOR_RIGHT_SEQUENCE.match(seq)): current_col += int(right.group(1) or 1) - else: - left = CURSOR_LEFT_SEQUENCE.match(seq) - if left: - current_col = max(0, current_col - int(left.group(1) or 1)) + elif (left := CURSOR_LEFT_SEQUENCE.match(seq)): + current_col = max(0, current_col - int(left.group(1) or 1)) + + # Or OSC 66 (kitty text sizing) + elif (ts_match := TEXT_SIZING_PATTERN.match(seq)): + text_size = TextSizing.from_match(ts_match, control_codes=control_codes) + current_col += text_size.display_width(ambiguous_width) idx = match.end() else: + # Errant ESC or unknown sequence: only the first character is zero-width idx += 1 max_extent = max(max_extent, current_col) continue diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index ba51d1c..60a6b7a 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -22,7 +22,8 @@ ZERO_WIDTH_PATTERN = re.compile( # CSI sequences r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]|' - # OSC sequences + # OSC sequences, note that text sizing protocol (OSC 66) is special case in width() and clip(), + # and contrary to the variable name, it is positive width. 
r'\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)|' # APC sequences r'\x1b_[^\x1b\x07]*(?:\x07|\x1b\\)|' diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 45be03a..133023d 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -42,15 +42,16 @@ class _FieldMeta(typing.NamedTuple): name: str low: int high: int + default: int TEXT_FIELD_MAPPING: dict[str, _FieldMeta] = { - 's': _FieldMeta(name='scale', low=1, high=7), - 'w': _FieldMeta(name='width', low=0, high=7), - 'n': _FieldMeta(name='numerator', low=0, high=15), - 'd': _FieldMeta(name='denominator', low=0, high=15), - 'v': _FieldMeta(name='vertical_align', low=0, high=2), - 'h': _FieldMeta(name='horizontal_align', low=0, high=2)} + 's': _FieldMeta(name='scale', low=1, high=7, default=1), + 'w': _FieldMeta(name='width', low=0, high=7, default=0), + 'n': _FieldMeta(name='numerator', low=0, high=15, default=0), + 'd': _FieldMeta(name='denominator', low=0, high=15, default=0), + 'v': _FieldMeta(name='vertical_align', low=0, high=2, default=0), + 'h': _FieldMeta(name='horizontal_align', low=0, high=2, default=0)} class TextSizingParams(typing.NamedTuple): @@ -74,15 +75,20 @@ class TextSizingParams(typing.NamedTuple): vertical_align: int = 0 horizontal_align: int = 0 + def __repr__(self): + # modified to show values only when non-default + repr_fmt = ', '.join(f'{field.name}={getattr(self, field.name)}' + for field in TEXT_FIELD_MAPPING.values() + if getattr(self, field.name) != field.default) + return f'{self.__class__.__name__}({repr_fmt})' + + def make_sequence(self) -> str: """Build and return sub-part of an OSC 66 sequence.""" parts = [] - default_params = TextSizingParams() # build string for all known parameters of non-default values for field_key, field in TEXT_FIELD_MAPPING.items(): - val = getattr(self, field.name) - default_val = getattr(default_params, field.name) - if val != default_val: + if (val := getattr(self, field.name)) != field.default: parts.append(f'{field_key}={val}') return 
':'.join(parts) @@ -105,11 +111,13 @@ def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams vertical_align=0, horizontal_align=0) """ kwargs: typing.Dict[str, int] = {} + if not raw: + return cls() for part in raw.split(':'): if '=' not in part: if control_codes == 'strict': raise ValueError(f"Expected '=' in text sizing parameter (key=val), " - f"got {part!r}") + f"got {part!r} in OSC 66 sequence, {raw!r}") continue key, _eq, val = part.partition('=') field = TEXT_FIELD_MAPPING.get(key) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index ed07397..f0dd75c 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -91,7 +91,7 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH -from .text_sizing import TextSizing +from .text_sizing import TextSizing, TextSizingParams from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .table_ambiguous import AMBIGUOUS_EASTASIAN @@ -343,6 +343,9 @@ def clip( .. versionchanged:: 0.5.0 Added ``propagate_sgr`` parameter (default True). + .. versionchanged:: 0.6.1 + Parses OSC 66 Sequences. + Example:: >>> clip('hello world', 0, 5) @@ -386,7 +389,7 @@ def clip( if col >= end and sgr_at_clip_start is not None and char != '\x1b': break - # Handle escape sequences + # 1. 
Handle escape sequences if char == '\x1b': if (match := ZERO_WIDTH_PATTERN.match(text, idx)): seq = match.group() @@ -394,42 +397,75 @@ def clip( # Update SGR state; will be applied as prefix when visible content starts sgr = _sgr_state_update(sgr, seq) else: - # Non-SGR sequences always preserved - output.append(seq) - idx = match.end() - continue - - # OSC 66 (text sizing) has positive width, handle before zero-width path - if (ts_match := TEXT_SIZING_PATTERN.match(text, idx)): - text_size = TextSizing.from_match(ts_match, control_codes='parse') - w = text_size.display_width(ambiguous_width) - if col >= start and col + w <= end: - # fits as-is, keep going - output.append(ts_match.group()) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += w - elif col < end and col + w > start: - # TODO: currently we just replace it entirely with '***', - # when, we should instead "chop up" the text to fit .. - # this function is sparingly used, but it should handle OSC 66 correctly - visible = min(end, col + w) - max(start, col) - output.append(fillchar * visible) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += w - else: - col += w - idx = ts_match.end() - continue + # Non-SGR and Non-Text Sizing sequences always preserved + # TODO: what about cursor_left and right! preserved, or padded ?! 
+ ts_match = TEXT_SIZING_PATTERN.match(text, idx) + if ts_match is None: + idx = match.end() + continue + + # OSC 66 (text sizing) has positive width + text_size = TextSizing.from_match(ts_match, control_codes='parse') + ts_width = text_size.display_width(ambiguous_width) + if col >= start and col + ts_width <= end: + # fits as-is, keep going + output.append(seq) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += ts_width + elif col < end and col + ts_width > start: + # TODO: move to TextSizing.clip(start, end) + # TODO: fillchar padding + next_start = max(0, start - col) // text_size.params.scale + visible_width = (min(end, col + ts_width) - max(start, col)) + next_width = (0 if text_size.params.width == 0 + else visible_width // text_size.params.scale) + next_text_size_parms = TextSizingParams( + text_size.params.scale, + next_width, + text_size.params.numerator, + text_size.params.denominator, + text_size.params.vertical_align, + text_size.params.horizontal_align) + + # RECURSION just one time for clip() of inner text. Text sizing + # sequences cannot further contain any sequences. Although tabsize + # is "extended", the modulo margins are not. + next_inner_text = clip( + text_size.text, next_start, next_start + visible_width // text_size.params.scale, + fillchar=fillchar, tabsize=tabsize, ambiguous_width=ambiguous_width, + propagate_sgr=False) + next_text_size = TextSizing( + next_text_size_parms, + next_inner_text, + text_size.terminator) + + delta = next_text_size.display_width() - visible_width + #breakpoint() + if delta > 0: + # left-pad ?? + output.append(delta * fillchar) + if next_inner_text: + output.append(next_text_size.make_sequence()) + if delta < 0: + # or right-pad?? 
how do we do eeet TODO + output.append(abs(delta) * fillchar) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += ts_width + else: + col += ts_width + idx = ts_match.end() + continue + - # Handle bare ESC (not a valid sequence) + # 2. Handle bare ESC (not a valid sequence) if char == '\x1b': output.append(char) idx += 1 continue - # TAB expansion + # 3. TAB expansion if char == '\t': if tabsize > 0: next_tab = col + (tabsize - (col % tabsize)) @@ -444,27 +480,29 @@ def clip( idx += 1 continue - # Grapheme clustering for everything else + # 4. Grapheme clustering for everything else grapheme = next(iter_graphemes(text, start=idx)) - w = width(grapheme, ambiguous_width=ambiguous_width) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - if w == 0: + if grapheme_w == 0: + # TODO: How is this reachable ?? if start <= col < end: output.append(grapheme) - elif col >= start and col + w <= end: + elif col >= start and col + grapheme_w <= end: # Fully visible output.append(grapheme) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr - col += w - elif col < end and col + w > start: + col += grapheme_w + elif col < end and col + grapheme_w > start: # Partially visible (wide char at boundary) - output.append(fillchar * (min(end, col + w) - max(start, col))) + output.append(fillchar * (min(end, col + grapheme_w) - max(start, col))) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr - col += w + col += grapheme_w else: - col += w + # TODO and this?? 
+ col += grapheme_w idx += len(grapheme) From 0edd6633e838d83d199787c924a0355257181256 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 15:31:49 -0400 Subject: [PATCH 20/70] finish clip() of text sizing and left/right --- tests/test_clip_cjk_emoji.py | 46 +++++ tests/test_cursor_sequences_clip.py | 30 +++ tests/test_text_sizing.py | 59 ++---- wcwidth/__init__.py | 15 +- wcwidth/_width.py | 2 +- wcwidth/table_grapheme.py | 48 ++--- wcwidth/table_mc.py | 8 +- wcwidth/table_wide.py | 20 +- wcwidth/table_zero.py | 18 +- wcwidth/text_sizing.py | 5 +- wcwidth/wcwidth.py | 309 ++++++++++++++++++++++------ 11 files changed, 407 insertions(+), 153 deletions(-) create mode 100644 tests/test_clip_cjk_emoji.py create mode 100644 tests/test_cursor_sequences_clip.py diff --git a/tests/test_clip_cjk_emoji.py b/tests/test_clip_cjk_emoji.py new file mode 100644 index 0000000..64795a1 --- /dev/null +++ b/tests/test_clip_cjk_emoji.py @@ -0,0 +1,46 @@ +""" +Tests for clip() with CJK and Emoji characters. + +These ensure wide graphemes (CJK / emoji / ZWJ sequences) are clipped correctly: +- Partial columns of a wide grapheme are replaced by fillchar. +- Full grapheme included when fully inside slice. 
+""" +# 3rd party +import pytest + +# local +from wcwidth import clip, width + + +@pytest.mark.parametrize("ch", [ + "中", + "🙂", + "👨\u200d👩\u200d👧", # family ZWJ + "👩\u200d👩\u200d👧" # another ZWJ variant +]) +def test_partial_and_full_wide_grapheme(ch): + w = width(ch) + assert w >= 1 + if w > 1: + # partial clip of first column -> fillchar + assert clip(ch, 0, 1) == ' ' + # full clip covering entire grapheme -> original grapheme + assert clip(ch, 0, w) == ch + # width of clipped full grapheme should match + assert width(clip(ch, 0, w)) == w + else: + # narrow grapheme: trivial + assert clip(ch, 0, 1) == ch + + +def test_mixed_cjk_emoji_sequence(): + text = 'A中🙂B' + total_w = width(text) + # sanity + assert total_w >= 4 + # pick a slice that includes the middle two columns (center of string) + # ensure clip doesn't raise and width matches requested slice + start = 1 + end = 4 + out = clip(text, start, end) + assert width(out) == (end - start) diff --git a/tests/test_cursor_sequences_clip.py b/tests/test_cursor_sequences_clip.py new file mode 100644 index 0000000..d424715 --- /dev/null +++ b/tests/test_cursor_sequences_clip.py @@ -0,0 +1,30 @@ +""" +Tests for clip() handling of cursor left/right sequences (CSI C / CSI D). + +These tests codify expected visible results when cursor movement sequences affect horizontal +positions. They are intentionally specific and will drive future implementation changes in clip(). 
+""" +# 3rd party +import pytest + +# local +from wcwidth import clip + + +@pytest.mark.parametrize("text,start,end,expected", [ + # Cursor-right introduces a gap that should be filled with spaces + ("hello\x1b[10Cworld", 0, 10, "hello" + " " * 5), + # Clipping just the initial region ignores the later rightward write + ("hello\x1b[10Cworld", 0, 5, "hello"), + # Cursor-left overwrites previous characters + ("hello\x1b[2DXY", 0, 5, "helXY"), +]) +def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): + """ + Verify clip() output matches terminal-visible columns after cursor moves. + + These tests capture the desired semantics: cursor-right creates blank cells (fillchar) in + the clipped output if the moved-to columns are within the clip window; cursor-left allows + subsequent characters to overwrite previous content and the clip should reflect that. + """ + assert clip(text, start, end) == expected diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index a9fc905..cd3421a 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -30,6 +30,7 @@ ('h=3', "h=2", "Out of bounds text sizing value '3' in "), ] + @pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" @@ -123,6 +124,7 @@ def test_text_sizing_sequence(given_sequence, expected_text, expected_params, ex assert wcwidth.width(given_sequence, control_codes='strict') == expected_width assert wcwidth.width(given_sequence, control_codes='ignore') == wcwidth.wcswidth(expected_text) + @pytest.mark.parametrize('text,expected', [ ('\x1b]66;s=2:w=3:n=1:d=2:v=1:h=2;x!yzzy\x1b\\', 6), ('\x1b]66;s=2:w=3;anything\x07', 6), @@ -171,11 +173,12 @@ def test_iter_sequences_text_sizing(text, expected_segs): ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 
'ab\x1b]66;w=1;X\x07'), ('ab\x1b]66;w=2;XY\x07cd', 3, 6, '\x1b]66;w=1;Y\x07cd'), ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), - ]) +]) def test_clip_text_sizing_basic(text, start, end, expected): - """Test basic support of clip() with text sizing sequence.""" + """Test basic support of clip() with text sizing sequence.""" assert repr(wcwidth.clip(text, start, end)) == repr(expected) + @pytest.mark.parametrize('text,start,end,expected', [ ('\x1b]66;s=2;ABC\x07', 0, 0, ''), ('\x1b]66;s=2;ABC\x07', 6, 6, ''), @@ -184,17 +187,18 @@ def test_clip_text_sizing_basic(text, start, end, expected): ('\x1b]66;s=2;ABC\x07', 0, 6, '\x1b]66;s=2;ABC\x07'), ('\x1b]66;s=2;ABC\x07', 2, 6, '\x1b]66;s=2;BC\x07'), ('\x1b]66;s=2;ABC\x07', 4, 6, '\x1b]66;s=2;C\x07'), - ]) +]) def test_clip_text_sizing_scaled(text, start, end, expected): - """Test support of clip() with scale=N arguments.""" + """Test support of clip() with scale=N arguments.""" assert repr(wcwidth.clip(text, start, end)) == repr(expected) + @pytest.mark.parametrize('text,start,end,expected', [ # a b c # === === === # 012 345 678 - # . - # .. + # . + # .. # *a* # *a* . # ... *b* @@ -223,15 +227,15 @@ def test_clip_text_sizing_scaled(text, start, end, expected): # .. *b* . 1, 7 # .. *b* .. 1, 8 # .. 
*b* *c* 1, 9 - ('\x1b]66;s=3;ABC\x07', 1, 1, ''), - ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), - ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), - ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), - ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), - ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;BC\x07..'), - ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 1, ''), + ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), + ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), + ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), + ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), + ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), ('\x1b]66;s=3;ABC\x07', 1, 10, '..\x1b]66;s=3;BC\x07'), # two-thirds of string 'A' and half of string 'B' is fillchar # ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), @@ -264,28 +268,7 @@ def test_clip_text_sizing_scaled(text, start, end, expected): ('\x1b]66;s=3;ABC\x07', 3, 8, '\x1b]66;s=3;B\x07..'), ('\x1b]66;s=3;ABC\x07', 3, 9, '\x1b]66;s=3;BC\x07'), ('\x1b]66;s=3;ABC\x07', 3, 10, '\x1b]66;s=3;BC\x07'), - ]) +]) def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): - """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" + """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" assert repr(wcwidth.clip(text, start, end, fillchar='.')) == repr(expected) - - - -# TODO wrap() cases, -# -# WRAP_CASES = [ -# (TextSizingParams(scale=2, width=2), -# '\x1b]66;s=2:w=2;ABC\x1b\\'), -# (TextSizingParams(scale=2, width=2), -# '\x1b]66;s=2:w=2;ABC\x1b\\'), -# (TextSizingParams(scale=1), -# '\x1b]66;;ABC\x1b\\'), -# (TextSizingParams(scale=3, width=1, numerator=1, denominator=2, -# vertical_align=1, horizontal_align=2), -# 
'\x1b]66;s=3:w=1:n=1:d=2:v=1:h=2;ABC\x1b\\'), -# ] -# -# -# @pytest.mark.parametrize('seq,expected', PARSE_SEQUENCE_CASES) -# def test_parse_text_sizing(seq, expected): -# assert parse_text_sizing(seq) == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index d60fdf1..b4b4494 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -41,4 +41,17 @@ # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places -__version__ = '0.6.1' # don't forget to also update pyproject.toml:version +# Prefer the installed distribution version when available (helps test environments) +try: + # std imports + from importlib import metadata as importlib_metadata +except ImportError: # pragma: no cover - fallback for very old Pythons + importlib_metadata = None + +if importlib_metadata is not None: + try: + __version__ = importlib_metadata.version('wcwidth') + except importlib_metadata.PackageNotFoundError: + __version__ = '0.6.1' +else: + __version__ = '0.6.1' # don't forget to also update pyproject.toml:version diff --git a/wcwidth/_width.py b/wcwidth/_width.py index d01a7a4..9dc68ed 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -168,7 +168,7 @@ def width( current_col += int(right.group(1) or 1) elif (left := CURSOR_LEFT_SEQUENCE.match(seq)): current_col = max(0, current_col - int(left.group(1) or 1)) - + # Or OSC 66 (kitty text sizing) elif (ts_match := TEXT_SIZING_PATTERN.match(seq)): text_size = TextSizing.from_match(ts_match, control_codes=control_codes) diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index 42fd19e..b9cfdc4 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,7 +4,7 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. 
-This code generated by wcwidth/bin/update-tables.py on 2026-01-29 23:33:42 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. """ # pylint: disable=duplicate-code @@ -202,8 +202,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b44,), # Balinese Vowel Sign Pepe..Balinese Adeg Adeg @@ -284,7 +284,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -367,9 +367,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a99,), # Soyombo Gemination Mark ..Soyombo Subjoiner - (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe - (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short - (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E + (0x11b60, 0x11b60,), # (nil) + (0x11b62, 0x11b64,), # (nil) + (0x11b66, 0x11b66,), # (nil) (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel 
Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -426,10 +426,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri @@ -617,9 +617,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe - (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O - (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O + (0x11b61, 0x11b61,), # (nil) + (0x11b65, 0x11b65,), # (nil) + (0x11b67, 0x11b67,), # (nil) (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya @@ -1892,8 +1892,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign 
Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b43,), # Balinese Vowel Sign Pepe..Balinese Vowel Sign Pepe @@ -1972,7 +1972,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -2055,9 +2055,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a98,), # Soyombo Gemination Mark - (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe - (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short - (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E + (0x11b60, 0x11b60,), # (nil) + (0x11b62, 0x11b64,), # (nil) + (0x11b66, 0x11b66,), # (nil) (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -2114,10 +2114,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta 
(0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 7c2e691..e0ee591 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,7 +1,7 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. """ # pylint: disable=duplicate-code CATEGORY_MC = { @@ -181,9 +181,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe - (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O - (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O + (0x11b61, 0x11b61,), # (nil) + (0x11b65, 0x11b65,), # (nil) + (0x11b67, 0x11b67,), # (nil) (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index ed6f48a..898734c 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. 
""" # pylint: disable=duplicate-code WIDE_EASTASIAN = { @@ -71,10 +71,10 @@ (0x0ff01, 0x0ff60,), # Fullwidth Exclamation Ma..Fullwidth Right White Pa (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma - (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be + (0x16ff2, 0x16ff6,), # (nil) (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) - (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 + (0x18d80, 0x18df2,), # (nil) (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N (0x1affd, 0x1affe,), # Katakana Letter Minnan N..Katakana Letter Minnan N @@ -116,7 +116,7 @@ (0x1f680, 0x1f6c5,), # Rocket ..Left Luggage (0x1f6cc, 0x1f6cc,), # Sleeping Accommodation (0x1f6d0, 0x1f6d2,), # Place Of Worship ..Shopping Trolley - (0x1f6d5, 0x1f6d8,), # Hindu Temple ..Landslide + (0x1f6d5, 0x1f6d8,), # Hindu Temple ..(nil) (0x1f6dc, 0x1f6df,), # Wireless ..Ring Buoy (0x1f6eb, 0x1f6ec,), # Airplane Departure ..Airplane Arriving (0x1f6f4, 0x1f6fc,), # Scooter ..Roller Skate @@ -126,12 +126,12 @@ (0x1f93c, 0x1f945,), # Wrestlers ..Goal Net (0x1f947, 0x1f9ff,), # First Place Medal ..Nazar Amulet (0x1fa70, 0x1fa7c,), # Ballet Shoes ..Crutch - (0x1fa80, 0x1fa8a,), # Yo-yo ..Trombone - (0x1fa8e, 0x1fac6,), # Treasure Chest ..Fingerprint - (0x1fac8, 0x1fac8,), # Hairy Creature - (0x1facd, 0x1fadc,), # Orca ..Root Vegetable - (0x1fadf, 0x1faea,), # Splatter ..Distorted Face - (0x1faef, 0x1faf8,), # Fight Cloud ..Rightwards Pushing Hand + (0x1fa80, 0x1fa8a,), # Yo-yo ..(nil) + (0x1fa8e, 0x1fac6,), # (nil) ..Fingerprint + (0x1fac8, 0x1fac8,), # (nil) + (0x1facd, 0x1fadc,), # (nil) ..Root Vegetable + (0x1fadf, 0x1faea,), # Splatter ..(nil) + (0x1faef, 0x1faf8,), # (nil) ..Rightwards Pushing Hand (0x20000, 0x2fffd,), # Cjk 
Unified Ideograph-20..(nil) (0x30000, 0x3fffd,), # Cjk Unified Ideograph-30..(nil) ), diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index c440bfc..0e8108d 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,7 +1,7 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. """ # pylint: disable=duplicate-code ZERO_WIDTH = { @@ -147,8 +147,8 @@ (0x01a55, 0x01a5e,), # Tai Tham Consonant Sign ..Tai Tham Consonant Sign (0x01a60, 0x01a7c,), # Tai Tham Sign Sakot ..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B - (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa + (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) + (0x01ae0, 0x01aeb,), # (nil) (0x01b00, 0x01b04,), # Balinese Sign Ulu Ricem ..Balinese Sign Bisah (0x01b34, 0x01b44,), # Balinese Sign Rerekan ..Balinese Adeg Adeg (0x01b6b, 0x01b73,), # Balinese Musical Symbol ..Balinese Musical Symbol @@ -222,7 +222,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11000, 0x11002,), # Brahmi Sign Candrabindu ..Brahmi Sign Visarga @@ -284,7 +284,7 @@ (0x11a47, 0x11a47,), # Zanabazar Square Subjoiner (0x11a51, 0x11a5b,), # Soyombo Vowel Sign I ..Soyombo Vowel Length Mar (0x11a8a, 0x11a99,), # Soyombo Final Consonant ..Soyombo Subjoiner 
- (0x11b60, 0x11b67,), # Sharada Vowel Sign Oe ..Sharada Vowel Sign Candr + (0x11b60, 0x11b67,), # (nil) (0x11c2f, 0x11c36,), # Bhaiksuki Vowel Sign Aa ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3f,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Virama (0x11c92, 0x11ca7,), # Marchen Subjoined Letter..Marchen Subjoined Letter @@ -339,10 +339,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue - (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au - (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang - (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om + (0x1e6e3, 0x1e6e3,), # (nil) + (0x1e6e6, 0x1e6e6,), # (nil) + (0x1e6ee, 0x1e6ef,), # (nil) + (0x1e6f5, 0x1e6f5,), # (nil) (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0xe0000, 0xe0fff,), # (nil) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 133023d..6687053 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -76,13 +76,16 @@ class TextSizingParams(typing.NamedTuple): horizontal_align: int = 0 def __repr__(self): + """Return a compact representation including only non-default fields. + + This avoids verbose output when most fields are defaults. 
+ """ # modified to show values only when non-default repr_fmt = ', '.join(f'{field.name}={getattr(self, field.name)}' for field in TEXT_FIELD_MAPPING.values() if getattr(self, field.name) != field.default) return f'{self.__class__.__name__}({repr_fmt})' - def make_sequence(self) -> str: """Build and return sub-part of an OSC 66 sequence.""" parts = [] diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index f0dd75c..e45a1bd 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -355,7 +355,7 @@ def clip( >>> clip('a\tb', 0, 10) # Tab expanded to spaces 'a b' """ - # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks + # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks,W0101 # Again, for 'hot path', we avoid additional delegate functions and accept the cost # of complexity for improved python performance. start = max(start, 0) @@ -378,10 +378,77 @@ def clip( if propagate_sgr: sgr = _SGR_STATE_DEFAULT - output: list[str] = [] + # output_tokens stores tuples ('vis', text) for visible content and ('seq', seq) + # for preserved zero-width sequences. This allows cursor-left overwrites to + # remove previously emitted visible characters while keeping the sequence order. 
+ # For visible tokens we store ('vis', text, width_in_columns) + # For sequences we store ('seq', seq) + output_tokens: list[tuple[str, ...]] = [] + visible_count = 0 # number of visible columns emitted so far col = 0 idx = 0 + def _append_visible(s: str, w: int, start_col: int | None = None) -> None: + nonlocal visible_count, sgr_at_clip_start + if w <= 0: + return + if start_col is None: + start_col = col + prev = output_tokens[-1] if (output_tokens and output_tokens[-1][0] == 'vis') else None + if prev is not None and prev[3] + prev[2] == start_col: + # merge with previous contiguous visible token: append text and add widths + prev_s = prev[1] + prev_w = prev[2] + prev_start = prev[3] + output_tokens[-1] = ('vis', prev_s + s, prev_w + w, prev_start) + else: + output_tokens.append(('vis', s, w, start_col)) + visible_count += w + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + def _append_seq(seq: str) -> None: + nonlocal sgr_at_clip_start + output_tokens.append(('seq', seq)) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + def _remove_visible_tail(n: int) -> None: + """Remove n visible columns from the end of output_tokens (overwrite semantics).""" + nonlocal visible_count + to_remove = n + while to_remove > 0 and visible_count > 0: + # find last visible token + i = len(output_tokens) - 1 + while i >= 0 and output_tokens[i][0] != 'vis': + i -= 1 + if i < 0: + break + tok = output_tokens[i] + tok_s = tok[1] + tok_w = tok[2] + tok_start = tok[3] + if tok_w <= to_remove: + # remove entire token + output_tokens.pop(i) + to_remove -= tok_w + visible_count -= tok_w + else: + # shorten token by removing columns from the end + keep_cols = tok_w - to_remove + # slice the string by grapheme widths + kept_text = '' + acc = 0 + for g in iter_graphemes(tok_s): + gw = width(g, ambiguous_width=ambiguous_width) + if acc + gw > keep_cols: + break + kept_text += g + acc += gw + output_tokens[i] = ('vis', kept_text, acc, 
tok_start) + visible_count -= to_remove + to_remove = 0 + while idx < len(text): char = text[idx] @@ -396,72 +463,148 @@ def clip( if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): # Update SGR state; will be applied as prefix when visible content starts sgr = _sgr_state_update(sgr, seq) - else: - # Non-SGR and Non-Text Sizing sequences always preserved - # TODO: what about cursor_left and right! preserved, or padded ?! - ts_match = TEXT_SIZING_PATTERN.match(text, idx) - if ts_match is None: + # we've consumed the sequence; advance index and continue + idx = match.end() + continue + # Non-SGR and Non-Text Sizing sequences always preserved + # TODO: what about cursor_left and right! preserved, or padded ?! + ts_match = TEXT_SIZING_PATTERN.match(text, idx) + if ts_match is None: + # Handle cursor movement sequences specially to simulate visible + # effects (fillchar padding for rightward moves, overwrite for left). + if CURSOR_RIGHT_SEQUENCE.match(seq): + # parse numeric argument (default 1) + try: + n = int(seq.lstrip('\x1b[').rstrip('C')) + except ValueError: + n = 1 + # If movement crosses into the clip window, emit fillchars + move_start = col + move_end = col + n + if move_start < end and move_end > start: + overlap_start = max(move_start, start) + overlap_end = min(move_end, end) + overlap = overlap_end - overlap_start + if overlap > 0: + _append_visible(fillchar * overlap, overlap) + col += n + idx = match.end() + continue + if CURSOR_LEFT_SEQUENCE.match(seq): + try: + n = int(seq.lstrip('\x1b[').rstrip('D')) + except ValueError: + n = 1 + prev_col = col + col = max(0, col - n) + # If we moved left and had emitted visible columns beyond + # the new col, they are now potentially overwritten. + if prev_col > col: + to_remove = min(prev_col - col, visible_count) + if to_remove > 0: + _remove_visible_tail(to_remove) idx = match.end() continue + # Preserve other non-SGR zero-width sequences (OSC hyperlinks, CSI others, etc.) 
+ _append_seq(seq) + idx = match.end() + continue # OSC 66 (text sizing) has positive width text_size = TextSizing.from_match(ts_match, control_codes='parse') ts_width = text_size.display_width(ambiguous_width) if col >= start and col + ts_width <= end: # fits as-is, keep going - output.append(seq) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + _append_seq(seq) col += ts_width elif col < end and col + ts_width > start: - # TODO: move to TextSizing.clip(start, end) - # TODO: fillchar padding - next_start = max(0, start - col) // text_size.params.scale - visible_width = (min(end, col + ts_width) - max(start, col)) - next_width = (0 if text_size.params.width == 0 - else visible_width // text_size.params.scale) - next_text_size_parms = TextSizingParams( - text_size.params.scale, - next_width, - text_size.params.numerator, - text_size.params.denominator, - text_size.params.vertical_align, - text_size.params.horizontal_align) - - # RECURSION just one time for clip() of inner text. Text sizing - # sequences cannot further contain any sequences. Although tabsize - # is "extended", the modulo margins are not. - next_inner_text = clip( - text_size.text, next_start, next_start + visible_width // text_size.params.scale, - fillchar=fillchar, tabsize=tabsize, ambiguous_width=ambiguous_width, - propagate_sgr=False) - next_text_size = TextSizing( - next_text_size_parms, - next_inner_text, - text_size.terminator) - - delta = next_text_size.display_width() - visible_width - #breakpoint() - if delta > 0: - # left-pad ?? - output.append(delta * fillchar) - if next_inner_text: - output.append(next_text_size.make_sequence()) - if delta < 0: - # or right-pad?? how do we do eeet TODO - output.append(abs(delta) * fillchar) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + # Clip inside the text-sizing block. Only include whole inner units + # (scaled slots) as sequences. 
Partial units are represented by + # fillchar characters covering the visible columns. + rel_start = max(0, start - col) + rel_end = min(end, col + ts_width) - col + scale = text_size.params.scale + + # Build unit list: for width>0, units are declared slots (one per width) + # otherwise units are grapheme clusters of inner text. + units: list[tuple[str, int]] = [] + if text_size.params.width > 0: + inner_graphemes = list(iter_graphemes(text_size.text)) + for j in range(text_size.params.width): + g = inner_graphemes[j] if j < len(inner_graphemes) else '' + # declared slots each occupy exactly `scale` columns + units.append((g, scale)) + else: + for g in iter_graphemes(text_size.text): + inner_w = width(g, ambiguous_width=ambiguous_width) + units.append((g, inner_w * scale)) + + pos = 0 + pending_run_texts: list[str] = [] + pending_run_count = 0 + + def emit_pending_run(): + nonlocal pending_run_texts, pending_run_count + if pending_run_count == 0: + return + inner_text = ''.join(pending_run_texts) + if text_size.params.width > 0: + params = TextSizingParams( + scale, + pending_run_count, + text_size.params.numerator, + text_size.params.denominator, + text_size.params.vertical_align, + text_size.params.horizontal_align) + else: + params = TextSizingParams( + scale, + 0, + text_size.params.numerator, + text_size.params.denominator, + text_size.params.vertical_align, + text_size.params.horizontal_align) + ts = TextSizing(params, inner_text, text_size.terminator) + _append_seq(ts.make_sequence()) + pending_run_texts = [] + pending_run_count = 0 + + for unit_text, unit_scaled_w in units: + unit_start = pos + unit_end = pos + unit_scaled_w + if unit_end <= rel_start: + pos = unit_end + continue + if unit_start >= rel_end: + break + overlap = min(unit_end, rel_end) - max(unit_start, rel_start) + + # If overlap covers entire unit, include it in pending run. 
+ if overlap == unit_scaled_w and unit_scaled_w > 0: + pending_run_texts.append(unit_text) + pending_run_count += 1 + else: + # Partial unit or gap: flush pending run and emit fillchars + emit_pending_run() + if overlap > 0: + # absolute start column of this overlap inside the ts block + abs_start = col + max(unit_start, rel_start) + _append_visible(fillchar * overlap, overlap, abs_start) + + pos = unit_end + + # flush remaining run if any + emit_pending_run() + col += ts_width else: col += ts_width idx = ts_match.end() continue - # 2. Handle bare ESC (not a valid sequence) if char == '\x1b': - output.append(char) + _append_seq(char) idx += 1 continue @@ -471,12 +614,11 @@ def clip( next_tab = col + (tabsize - (col % tabsize)) while col < next_tab: if start <= col < end: - output.append(' ') - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + _append_visible(' ', 1) col += 1 else: - output.append(char) + # preserve tab as-is + _append_seq(char) idx += 1 continue @@ -485,28 +627,65 @@ def clip( grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) if grapheme_w == 0: - # TODO: How is this reachable ?? 
+ # combining/zero-width grapheme; preserve as sequence-like token at this column if start <= col < end: - output.append(grapheme) + _append_seq(grapheme) elif col >= start and col + grapheme_w <= end: # Fully visible - output.append(grapheme) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + _append_visible(grapheme, grapheme_w) col += grapheme_w elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) - output.append(fillchar * (min(end, col + grapheme_w) - max(start, col))) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + # Partially visible (wide char at boundary) -> emit fillchars for visible portion + overlap = min(end, col + grapheme_w) - max(start, col) + abs_start = max(start, col) + _append_visible(fillchar * overlap, overlap, abs_start) col += grapheme_w else: - # TODO and this?? col += grapheme_w idx += len(grapheme) - result = ''.join(output) + # Reconstruct result from output_tokens, slicing visible content to [start,end) + parts: list[str] = [] + for tok in output_tokens: + if tok[0] == 'seq': + parts.append(tok[1]) + else: + # visible chunk: ('vis', text, width_in_cols, start_col) + _, text, tok_w, tok_start = tok + chunk_len = tok_w + chunk_start = tok_start + chunk_end = chunk_start + chunk_len + if chunk_end <= start: + continue + if chunk_start >= end: + continue + s0 = max(0, start - chunk_start) + s1 = min(chunk_len, end - chunk_start) + # slice `text` for columns [s0, s1) + acc = 0 + slice_text = '' + for g in iter_graphemes(text): + gw = width(g, ambiguous_width=ambiguous_width) + next_acc = acc + gw + if next_acc <= s0: + acc = next_acc + continue + if acc >= s1: + break + # include this grapheme (or part of it) + # graphemes are atomic; if they partially overlap, use fillchar instead + if acc < s0 or next_acc > s1: + # partial grapheme -> fill with appropriate number of fillchars + left = max(0, s0 - acc) + right = min(gw, s1 - acc) + slice_text += 
fillchar * (right - left) + else: + slice_text += g + acc = next_acc + parts.append(slice_text) + + result = ''.join(parts) # Apply SGR prefix/suffix if sgr_at_clip_start is not None: From 5aa4265eb959d558a912f0f0cbdfc92d86bf23f1 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Tue, 28 Apr 2026 16:46:58 -0400 Subject: [PATCH 21/70] checkpoint --- tests/test_clip.py | 18 +++++++----- wcwidth/wcwidth.py | 73 ++++++++++++++++++++-------------------------- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/tests/test_clip.py b/tests/test_clip.py index 995d383..d4d350a 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -114,26 +114,26 @@ def test_clip_sequences_after_end(): # With propagate_sgr=True (default), no style active at start, so no prefix assert clip('hello\x1b[31m world\x1b[0m', 0, 5) == 'hello' # With propagate_sgr=False, all sequences preserved - assert clip('hello\x1b[31m world\x1b[0m', 0, 5, propagate_sgr=False) == 'hello\x1b[31m\x1b[0m' + assert repr(clip('hello\x1b[31m world\x1b[0m', 0, 5, propagate_sgr=False)) == repr('hello\x1b[31m\x1b[0m') def test_clip_sequences_multiple(): # With propagate_sgr=True (default), sequences collapsed to minimal assert clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4) == '\x1b[1;31mbold\x1b[0m' # With propagate_sgr=False, all sequences preserved separately - assert clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4, propagate_sgr=False) == '\x1b[1m\x1b[31mbold\x1b[0m' + assert repr(clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4, propagate_sgr=False)) == repr('\x1b[1m\x1b[31mbold\x1b[0m') def test_clip_sequences_only(): # With propagate_sgr=True (default), no visible text means empty result assert clip('\x1b[31m\x1b[0m', 0, 10) == '' # With propagate_sgr=False, sequences preserved - assert clip('\x1b[31m\x1b[0m', 0, 10, propagate_sgr=False) == '\x1b[31m\x1b[0m' + assert repr(clip('\x1b[31m\x1b[0m', 0, 10, propagate_sgr=False)) == repr('\x1b[31m\x1b[0m') def test_clip_sequences_osc_hyperlink(): - assert 
clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4) == \ - '\x1b]8;;https://example.com\x07link\x1b]8;;\x07' + assert repr(clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4)) == \ + repr('\x1b]8;;https://example.com\x07link\x1b]8;;\x07') def test_clip_sequences_cjk_with_sequences(): @@ -248,13 +248,15 @@ def test_clip_control_chars_zero_width(text, start, end, expected): ('ab\x1b[5Ccd', 0, 4, 'ab\x1b[5Ccd'), ('abcde\x1b[2Df', 0, 6, 'abcde\x1b[2Df'), ('ab\x1b[10Ccd', 0, 4, 'ab\x1b[10Ccd'), - ('ab\x1b[Ccd', 0, 4, 'ab\x1b[Ccd'), + ('XY\x1b[Czy', 0, 4, 'XY\x1b[Cz'), + ('XY\x1b[Czy', 0, 5, 'XY\x1b[Czy'), + ('XY\x1b[Czy', 1, 3, 'XY '), + ('XY\x1b[Czy', 1, 4, 'XY\x1b[C'), ] - @pytest.mark.parametrize('text,start,end,expected', CLIP_CURSOR_SEQUENCE_CASES) def test_clip_cursor_sequences_zero_width(text, start, end, expected): - assert clip(text, start, end) == expected + assert repr(clip(text, start, end)) == repr(expected) def test_clip_tab_first_visible_with_sgr(): diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index e45a1bd..6f9e451 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -466,50 +466,40 @@ def _remove_visible_tail(n: int) -> None: # we've consumed the sequence; advance index and continue idx = match.end() continue - # Non-SGR and Non-Text Sizing sequences always preserved - # TODO: what about cursor_left and right! preserved, or padded ?! - ts_match = TEXT_SIZING_PATTERN.match(text, idx) - if ts_match is None: - # Handle cursor movement sequences specially to simulate visible - # effects (fillchar padding for rightward moves, overwrite for left). 
- if CURSOR_RIGHT_SEQUENCE.match(seq): - # parse numeric argument (default 1) - try: - n = int(seq.lstrip('\x1b[').rstrip('C')) - except ValueError: - n = 1 - # If movement crosses into the clip window, emit fillchars - move_start = col - move_end = col + n - if move_start < end and move_end > start: - overlap_start = max(move_start, start) - overlap_end = min(move_end, end) - overlap = overlap_end - overlap_start - if overlap > 0: - _append_visible(fillchar * overlap, overlap) - col += n - idx = match.end() - continue - if CURSOR_LEFT_SEQUENCE.match(seq): - try: - n = int(seq.lstrip('\x1b[').rstrip('D')) - except ValueError: - n = 1 - prev_col = col - col = max(0, col - n) - # If we moved left and had emitted visible columns beyond - # the new col, they are now potentially overwritten. - if prev_col > col: - to_remove = min(prev_col - col, visible_count) - if to_remove > 0: - _remove_visible_tail(to_remove) - idx = match.end() - continue - # Preserve other non-SGR zero-width sequences (OSC hyperlinks, CSI others, etc.) - _append_seq(seq) + + # Handle cursor movement sequences specially to simulate visible + # effects (fillchar padding for rightward moves, overwrite for left). 
+ if (match_cleft := CURSOR_RIGHT_SEQUENCE.match(seq)): + # parse numeric argument (default 1) + digit_txt = match_cleft.group(1) + n_left = int(digit_txt) if digit_txt else 1 + # If movement crosses into the clip window, emit fillchars + move_start = col + move_end = col + n_left + if move_start < end and move_end > start: + overlap_start = max(move_start, start) + overlap_end = min(move_end, end) + overlap = overlap_end - overlap_start + if overlap > 0: + _append_visible(fillchar * overlap, overlap) + col += n_left idx = match.end() continue + if (match_cright := CURSOR_LEFT_SEQUENCE.match(seq)): + digit_txt = match_cright.group(1) + n_right = int(digit_txt) if digit_txt else 1 + prev_col = col + col = max(0, col - n_right) + # If we moved left and had emitted visible columns beyond + # the new col, they are now potentially overwritten. + if prev_col > col: + to_remove = min(prev_col - col, visible_count) + if to_remove > 0: + _remove_visible_tail(to_remove) + idx = match.end() + continue + if (ts_match := TEXT_SIZING_PATTERN.match(seq)): # OSC 66 (text sizing) has positive width text_size = TextSizing.from_match(ts_match, control_codes='parse') ts_width = text_size.display_width(ambiguous_width) @@ -598,6 +588,7 @@ def emit_pending_run(): col += ts_width else: + # XXX nothing to clip? TODO breakpoint() and verify col += ts_width idx = ts_match.end() continue From 7193ac79611d156c7cc30e0da152dff232919b3a Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 14:16:56 -0400 Subject: [PATCH 22/70] much better clip(), also comes with a refactor .. 
--- .github/workflows/ci.yml | 10 - bin/new-wide-by-version.py | 1 + bin/update-tables.py | 11 +- bin/verify-table-integrity.py | 1 + bin/wcwidth-browser.py | 18 +- bin/wcwidth-libc-comparator.py | 1 + docs/requirements.txt | 17 +- docs/unicode_version.rst | 15 -- requirements-tests38.in | 2 +- requirements-tests38.txt | 5 - requirements-tests39.in | 2 +- requirements-tests39.txt | 7 +- requirements-update.txt | 6 +- tests/conftest.py | 2 + tests/test_ambiguous.py | 1 + tests/test_benchmarks.py | 32 +++ tests/test_clip.py | 21 +- tests/test_clip_cjk_emoji.py | 1 + ...sequences_clip.py => test_clip_cursors.py} | 21 +- tests/test_core.py | 9 +- tests/test_emojis.py | 1 + tests/test_grapheme.py | 1 + tests/test_justify.py | 1 + tests/test_sgr_state.py | 1 + tests/test_text_sizing.py | 1 + tests/test_textwrap.py | 1 + tests/test_ucslevel.py | 1 + tests/test_width.py | 1 + tox.ini | 54 ++--- wcwidth/__init__.py | 1 + wcwidth/_constants.py | 1 + wcwidth/_wcswidth.py | 1 + wcwidth/_wcwidth.py | 1 + wcwidth/bisearch.py | 5 +- wcwidth/escape_sequences.py | 1 + wcwidth/sgr_state.py | 1 + wcwidth/table_ambiguous.py | 2 +- wcwidth/table_grapheme.py | 48 ++--- wcwidth/table_mc.py | 8 +- wcwidth/table_vs16.py | 2 +- wcwidth/table_wide.py | 24 +-- wcwidth/table_zero.py | 18 +- wcwidth/text_sizing.py | 17 +- wcwidth/textwrap.py | 1 + wcwidth/wcwidth.py | 191 +++++++++--------- 45 files changed, 302 insertions(+), 265 deletions(-) rename tests/{test_cursor_sequences_clip.py => test_clip_cursors.py} (62%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6fd35a..f7430c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,16 +72,6 @@ jobs: - "3.12" - "3.13" - "3.14" - # jquast: pypy disabled 4/28/26 when installing requirements-tests39.txt, "ERROR: Could not - # install packages due to an OSError: ('Connection broken: IncompleteRead(14094 bytes - # read, 554 more expected)', Occur *only* with pypy versions 3.10 and 3.11, and *only* on - # 
the Windows platform. Microsoft's is an international crime ring designed to keep - # mediocre middle-aged men employed doing barely any work at all, tracking and working - # around bullshit just like this, I'm not paid and I won't be played. - # - "pypy-3.8" - # - "pypy-3.9" - # - "pypy-3.10" - # - "pypy-3.11" runs-on: ${{ matrix.os }} container: ${{ matrix.container }} diff --git a/bin/new-wide-by-version.py b/bin/new-wide-by-version.py index b0ec5ad..3ba85cc 100755 --- a/bin/new-wide-by-version.py +++ b/bin/new-wide-by-version.py @@ -14,6 +14,7 @@ for Unicode version 5.0.0, and were not WIDE values for the previous version (4.1.0). """ + # std imports import sys import json diff --git a/bin/update-tables.py b/bin/update-tables.py index 9383703..959af50 100644 --- a/bin/update-tables.py +++ b/bin/update-tables.py @@ -1,13 +1,14 @@ #!/usr/bin/env python """ -Update the Unicode code tables for wcwidth. This is code generation using jinja2. +Update the Unicode code tables for wcwidth. -This is typically executed through tox, +This is code generation using jinja2. This is typically executed through tox, $ tox -e update https://github.com/jquast/wcwidth """ + from __future__ import annotations # std imports @@ -108,6 +109,7 @@ def _bisearch(ucs, table): @dataclass(order=True, frozen=True) class UnicodeVersion: """A class for comparable unicode version.""" + major: int minor: int micro: int | None @@ -138,6 +140,7 @@ def __str__(self) -> str: @dataclass(frozen=True) class TableEntry: """An entry of a unicode table.""" + code_range: tuple[int, int] | None properties: tuple[str, ...] 
comment: str @@ -255,6 +258,7 @@ class UnicodeTableRenderCtx(RenderContext): @dataclass class RenderDefinition: """Base class, do not instantiate it directly.""" + jinja_filename: str output_filename: str render_context: RenderContext @@ -330,6 +334,7 @@ def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self: @dataclass(frozen=True) class GraphemeTableRenderCtx(RenderContext): """Render context for grapheme tables (latest version only).""" + unicode_version: str tables: Mapping[str, TableDef] @@ -880,7 +885,6 @@ def fetch_table_grapheme_data() -> GraphemeTableRenderCtx: tables.update(parse_indic_syllabic_category( UnicodeDataFile.IndicSyllabicCategory(latest_version) )) - return GraphemeTableRenderCtx(str(latest_version), tables) @@ -895,6 +899,7 @@ class UnicodeDataFile: TestEmojiVariationSequences, these files should be forcefully re-fetched CLI argument '--no- check-last-modified'. """ + URL_DERIVED_AGE = 'https://www.unicode.org/Public/UCD/latest/ucd/DerivedAge.txt' URL_EASTASIAN_WIDTH = 'https://www.unicode.org/Public/{version}/ucd/EastAsianWidth.txt' URL_DERIVED_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/extracted/DerivedGeneralCategory.txt' diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py index 8a567cd..9a10a3b 100644 --- a/bin/verify-table-integrity.py +++ b/bin/verify-table-integrity.py @@ -65,6 +65,7 @@ +DerivedGeneralCategory-8.0.0.txt:19B0..19C9 ; Lo # [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 """ + # std imports import logging diff --git a/bin/wcwidth-browser.py b/bin/wcwidth-browser.py index 4a439a6..aa033e2 100755 --- a/bin/wcwidth-browser.py +++ b/bin/wcwidth-browser.py @@ -376,6 +376,7 @@ def page_size(self): class Pager: """A less(1)-like browser for browsing unicode characters.""" + # pylint: disable=too-many-instance-attributes #: screen state for next draw method(s). @@ -520,14 +521,13 @@ def run(self, writer, reader): """ Pager entry point. 
- In interactive mode (terminal is a tty), run until - ``process_keystroke()`` detects quit keystroke ('q'). In - non-interactive mode, exit after displaying all unicode points. + In interactive mode (terminal is a tty), run until ``process_keystroke()`` detects quit + keystroke ('q'). In non-interactive mode, exit after displaying all unicode points. :param writer: callable writes to output stream, receiving unicode. :type writer: callable - :param reader: callable reads keystrokes from input stream, sending - instance of blessed.keyboard.Keystroke. + :param reader: callable reads keystrokes from input stream, sending instance of + blessed.keyboard.Keystroke. :type reader: callable """ self.initialize_page_data() @@ -720,8 +720,8 @@ def draw_heading(self, writer): """ Conditionally redraw screen when ``dirty`` attribute is valued REFRESH. - When Pager attribute ``dirty`` is ``STATE_REFRESH``, cursor is moved - to (0,0), screen is cleared, and heading is displayed. + When Pager attribute ``dirty`` is ``STATE_REFRESH``, cursor is moved to (0,0), screen is + cleared, and heading is displayed. :param callable writer: callable writes to output stream, receiving unicode. :return: True if class attribute ``dirty`` is ``STATE_REFRESH``. @@ -787,8 +787,8 @@ def page_view(self, data): """ Generator yields text to be displayed for the current unicode pageview. - :param list[(unicode, unicode)] data: The current page's data as tuple - of ``(ucs, name)``. + :param list[(unicode, unicode)] data: The current page's data as tuple of ``(ucs, + name)``. :returns: generator for full-page text for display """ if self.term.is_a_tty: diff --git a/bin/wcwidth-libc-comparator.py b/bin/wcwidth-libc-comparator.py index 82e6be4..691c2f1 100755 --- a/bin/wcwidth-libc-comparator.py +++ b/bin/wcwidth-libc-comparator.py @@ -13,6 +13,7 @@ This program accepts one optional command-line argument, the unicode version level for our library to use when comparing to libc. 
""" + # pylint: disable=C0103 # Invalid module name "wcwidth-libc-comparator" diff --git a/docs/requirements.txt b/docs/requirements.txt index eb4f031..c237614 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,35 +1,34 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.14 # by the following command: # # pip-compile --allow-unsafe --no-emit-index-url --output-file=docs/requirements.txt --strip-extras requirements-docs.in # - alabaster==1.0.0 # via sphinx babel==2.18.0 # via sphinx -certifi==2026.1.4 +certifi==2026.4.22 # via requests -charset-normalizer==3.4.4 +charset-normalizer==3.4.7 # via requests docutils==0.22.4 # via # sphinx # sphinx-rtd-theme -idna==3.11 +idna==3.13 # via requests -imagesize==1.4.1 +imagesize==2.0.0 # via sphinx jinja2==3.1.6 # via sphinx markupsafe==3.0.3 # via jinja2 -packaging==26.0 +packaging==26.2 # via sphinx pygments==2.20.0 # via sphinx -requests==2.33.0 +requests==2.33.1 # via sphinx roman-numerals==4.1.0 # via sphinx @@ -41,7 +40,7 @@ sphinx==9.1.0 # sphinx-autodoc-typehints # sphinx-rtd-theme # sphinxcontrib-jquery -sphinx-autodoc-typehints==3.6.2 +sphinx-autodoc-typehints==3.10.2 # via -r requirements-docs.in sphinx-rtd-theme==3.1.0 # via -r requirements-docs.in diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst index 38ff78d..41a1e52 100644 --- a/docs/unicode_version.rst +++ b/docs/unicode_version.rst @@ -16,21 +16,6 @@ release files: ``emoji-variation-sequences-12.0.0.txt`` *Date: 2019-01-15, 12:10:05 GMT* -``emoji-variation-sequences-13.0.0.txt`` - *Date: 2020-01-21, 07:15:05 GMT* - -``emoji-variation-sequences-14.0.0.txt`` - *Date: 2021-06-08, 05:19:16 GMT* - -``emoji-variation-sequences-15.0.0.txt`` - *Date: 2022-05-13, 21:54:24 GMT* - -``emoji-variation-sequences-15.1.0.txt`` - *Date: 2023-02-01, 02:22:54 GMT* - -``emoji-variation-sequences-16.0.0.txt`` - *Date: 2024-05-01, 21:25:24 GMT* - 
``emoji-variation-sequences-17.0.0.txt`` *Date: 2025-01-30, 21:48:29 GMT* diff --git a/requirements-tests38.in b/requirements-tests38.in index fca2238..5158715 100644 --- a/requirements-tests38.in +++ b/requirements-tests38.in @@ -3,4 +3,4 @@ pytest<7 pytest-cov coverage[toml]<6 packaging<26 -pytest-benchmark<5 +# pytest-benchmark<5 diff --git a/requirements-tests38.txt b/requirements-tests38.txt index d043f15..2b8a269 100644 --- a/requirements-tests38.txt +++ b/requirements-tests38.txt @@ -20,15 +20,10 @@ pluggy==1.5.0 # via pytest py==1.11.0 # via pytest -py-cpuinfo==9.0.0 - # via pytest-benchmark pytest==6.2.5 # via # -r requirements-tests38.in - # pytest-benchmark # pytest-cov -pytest-benchmark==4.0.0 - # via -r requirements-tests38.in pytest-cov==5.0.0 # via -r requirements-tests38.in toml==0.10.2 diff --git a/requirements-tests39.in b/requirements-tests39.in index a2167f2..6714226 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -6,6 +6,6 @@ importlib-metadata<8.7.1 packaging<26.0 tomli<2.3.0 cffi<2 -pytest-benchmark pygments<2.20 zipp<3.23.1 +# pytest-benchmark diff --git a/requirements-tests39.txt b/requirements-tests39.txt index 1b92757..0fe17a5 100644 --- a/requirements-tests39.txt +++ b/requirements-tests39.txt @@ -30,8 +30,6 @@ pluggy==1.6.0 # via # pytest # pytest-cov -py-cpuinfo==9.0.0 - # via pytest-benchmark pycparser==2.23 # via cffi pygments==2.19.2 @@ -42,12 +40,9 @@ pygments==2.19.2 pytest==8.4.2 # via # -r requirements-tests39.in - # pytest-benchmark # pytest-codspeed # pytest-cov -pytest-benchmark==5.2.3 - # via -r requirements-tests39.in -pytest-codspeed==4.4.0 +pytest-codspeed==4.5.0 # via -r requirements-tests39.in pytest-cov==7.1.0 # via -r requirements-tests39.in diff --git a/requirements-update.txt b/requirements-update.txt index 92830ab..209b3f4 100644 --- a/requirements-update.txt +++ b/requirements-update.txt @@ -1,14 +1,14 @@ # -# This file is autogenerated by pip-compile with Python 3.13 +# This file is 
autogenerated by pip-compile with Python 3.14 # by the following command: # # pip-compile --allow-unsafe --no-emit-index-url --output-file=requirements-update.txt --strip-extras requirements-update.in # -certifi==2026.2.25 +certifi==2026.4.22 # via requests charset-normalizer==3.4.7 # via requests -idna==3.11 +idna==3.13 # via requests jinja2==3.1.6 # via -r requirements-update.in diff --git a/tests/conftest.py b/tests/conftest.py index 2d0a277..ecbbdc8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ """Pytest configuration and fixtures.""" + # 3rd party import pytest @@ -10,6 +11,7 @@ @pytest.fixture def benchmark(): """No-op benchmark fixture for environments without pytest-codspeed.""" + def _passthrough(func, *args, **kwargs): return func(*args, **kwargs) return _passthrough diff --git a/tests/test_ambiguous.py b/tests/test_ambiguous.py index 0c61cda..20ed4d7 100644 --- a/tests/test_ambiguous.py +++ b/tests/test_ambiguous.py @@ -1,4 +1,5 @@ """Tests for ambiguous_width parameter.""" + # 3rd party import pytest diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index a27653c..f26ced5 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -1,4 +1,5 @@ """Performance benchmarks for wcwidth module.""" + # std imports import os import sys @@ -292,6 +293,36 @@ def test_clip_complex_sgr(benchmark): benchmark(wcwidth.clip, text, 6, 11) +# OSC 66 (kitty text sizing protocol) benchmarks +OSC66_SMALL = '\x1b]66;w=2;XY\x07' +OSC66_SCALED = '\x1b]66;s=3;ABC\x07' +OSC66_LONG = ( + '\x1b]66;w=2;XY\x07' * 5 + + 'interleaved text ' * 5 + + '\x1b]66;s=3;ABC\x07' * 5 +) + + +@pytest.mark.parametrize('label,text', [ + ('small', OSC66_SMALL), + ('scaled', OSC66_SCALED), + ('long', OSC66_LONG), +], ids=lambda v: f'osc66_{v}') +def test_width_osc66(benchmark, label, text): + """Benchmark width() with OSC 66 sequences.""" + benchmark(wcwidth.width, text) + + +@pytest.mark.parametrize('label,text,start,end', [ + ('small', 
OSC66_SMALL, 0, 2), + ('scaled', OSC66_SCALED, 0, 9), + ('long', OSC66_LONG, 10, 60), +], ids=lambda v: f'osc66_{v}') +def test_clip_osc66(benchmark, label, text, start, end): + """Benchmark clip() with OSC 66 sequences.""" + benchmark(wcwidth.clip, text, start, end) + + def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] @@ -408,6 +439,7 @@ def test_width_udhr_lines(benchmark): @_py38_skip_pedantic def test_width_wcswidth_consistency_udhr(benchmark): """Verify width() and wcswidth() agree for printable multilingual text.""" + def check(): failures = [] for line in UDHR_LINES: diff --git a/tests/test_clip.py b/tests/test_clip.py index d4d350a..70897f8 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -1,4 +1,5 @@ """Tests for clip() and strip_sequences() functions.""" + # 3rd party import pytest @@ -132,8 +133,9 @@ def test_clip_sequences_only(): def test_clip_sequences_osc_hyperlink(): - assert repr(clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4)) == \ - repr('\x1b]8;;https://example.com\x07link\x1b]8;;\x07') + assert repr(clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4)) == repr( + '\x1b]8;;https://example.com\x07link\x1b]8;;\x07' + ) def test_clip_sequences_cjk_with_sequences(): @@ -244,21 +246,6 @@ def test_clip_control_chars_zero_width(text, start, end, expected): assert clip(text, start, end) == expected -CLIP_CURSOR_SEQUENCE_CASES = [ - ('ab\x1b[5Ccd', 0, 4, 'ab\x1b[5Ccd'), - ('abcde\x1b[2Df', 0, 6, 'abcde\x1b[2Df'), - ('ab\x1b[10Ccd', 0, 4, 'ab\x1b[10Ccd'), - ('XY\x1b[Czy', 0, 4, 'XY\x1b[Cz'), - ('XY\x1b[Czy', 0, 5, 'XY\x1b[Czy'), - ('XY\x1b[Czy', 1, 3, 'XY '), - ('XY\x1b[Czy', 1, 4, 'XY\x1b[C'), -] - -@pytest.mark.parametrize('text,start,end,expected', CLIP_CURSOR_SEQUENCE_CASES) -def test_clip_cursor_sequences_zero_width(text, start, end, expected): - assert repr(clip(text, start, end)) == repr(expected) - - 
def test_clip_tab_first_visible_with_sgr(): """Tab as first visible character with SGR propagation.""" assert clip('\x1b[31m\tb', 0, 4, tabsize=8) == '\x1b[31m \x1b[0m' diff --git a/tests/test_clip_cjk_emoji.py b/tests/test_clip_cjk_emoji.py index 64795a1..e41bd62 100644 --- a/tests/test_clip_cjk_emoji.py +++ b/tests/test_clip_cjk_emoji.py @@ -5,6 +5,7 @@ - Partial columns of a wide grapheme are replaced by fillchar. - Full grapheme included when fully inside slice. """ + # 3rd party import pytest diff --git a/tests/test_cursor_sequences_clip.py b/tests/test_clip_cursors.py similarity index 62% rename from tests/test_cursor_sequences_clip.py rename to tests/test_clip_cursors.py index d424715..93bd743 100644 --- a/tests/test_cursor_sequences_clip.py +++ b/tests/test_clip_cursors.py @@ -4,6 +4,7 @@ These tests codify expected visible results when cursor movement sequences affect horizontal positions. They are intentionally specific and will drive future implementation changes in clip(). """ + # 3rd party import pytest @@ -18,6 +19,24 @@ ("hello\x1b[10Cworld", 0, 5, "hello"), # Cursor-left overwrites previous characters ("hello\x1b[2DXY", 0, 5, "helXY"), + ('ab\x1b[5Ccd', 0, 4, 'ab '), + ('abcde\x1b[2Df', 0, 6, 'abcf'), + ('ab\x1b[10Ccd', 0, 4, 'ab '), + ('XY\x1b[Czy', 0, 4, 'XY z'), + ('XY\x1b[Czy', 0, 5, 'XY zy'), + ('XY\x1b[Czy', 1, 3, 'Y '), + ('XY\x1b[Czy', 1, 4, 'Y z'), + ('LOL\x1b[5Clol', 0, 12, 'LOL lol'), + ('LOL\x1b[5Clol', 1, 11, 'OL lol'), + ('LOL\x1b[5Clol', 2, 11, 'L lol'), + ('LOL\x1b[5Clol', 3, 11, ' lol'), + ('LOL\x1b[5Clol', 4, 11, ' lol'), + ('LOL\x1b[5Clol', 5, 11, ' lol'), + ('LOL\x1b[5Clol', 6, 11, ' lol'), + ('LOL\x1b[5Clol', 7, 11, ' lol'), + ('LOL\x1b[5Clol', 8, 11, 'lol'), + ('LOL\x1b[5Clol', 9, 11, 'ol'), + ]) def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): """ @@ -27,4 +46,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): the clipped output if the moved-to columns are within 
the clip window; cursor-left allows subsequent characters to overwrite previous content and the clip should reflect that. """ - assert clip(text, start, end) == expected + assert repr(clip(text, start, end)) == repr(expected) \ No newline at end of file diff --git a/tests/test_core.py b/tests/test_core.py index cf71d31..01c9fb5 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,4 +1,5 @@ """Core tests for wcwidth module.""" + # std imports import sys import importlib.metadata @@ -69,9 +70,8 @@ def test_hello_jp(): """ Width of Japanese phrase: コンニチハ, セカイ! - Given a phrase of 5 and 3 Katakana ideographs, joined with - 3 English-ASCII punctuation characters, totaling 11, this - phrase consumes 19 cells of a terminal emulator. + Given a phrase of 5 and 3 Katakana ideographs, joined with 3 English-ASCII punctuation + characters, totaling 11, this phrase consumes 19 cells of a terminal emulator. """ # given, phrase = 'コンニチハ, セカイ!' @@ -91,8 +91,7 @@ def test_wcswidth_substr(): """ Test wcswidth() optional 2nd parameter, ``n``. - ``n`` determines at which position of the string - to stop counting length. + ``n`` determines at which position of the string to stop counting length. """ # given, phrase = 'コンニチハ, セカイ!' 
diff --git a/tests/test_emojis.py b/tests/test_emojis.py index 9a962cc..f49c9ef 100644 --- a/tests/test_emojis.py +++ b/tests/test_emojis.py @@ -1,4 +1,5 @@ """Tests for emoji width measurement and ZWJ sequences.""" + # std imports import os diff --git a/tests/test_grapheme.py b/tests/test_grapheme.py index fb98723..8d139b9 100644 --- a/tests/test_grapheme.py +++ b/tests/test_grapheme.py @@ -1,4 +1,5 @@ """Tests for grapheme cluster segmentation.""" + # std imports import os diff --git a/tests/test_justify.py b/tests/test_justify.py index 71dec61..f2639e8 100644 --- a/tests/test_justify.py +++ b/tests/test_justify.py @@ -1,4 +1,5 @@ """Tests for text justification functions.""" + # local from wcwidth import ljust, rjust, width, center diff --git a/tests/test_sgr_state.py b/tests/test_sgr_state.py index db9c8a9..ecba402 100644 --- a/tests/test_sgr_state.py +++ b/tests/test_sgr_state.py @@ -1,4 +1,5 @@ """Tests for SGR state tracking and propagation.""" + from __future__ import annotations # std imports diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index cd3421a..86757e0 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -1,4 +1,5 @@ """Tests for Text Sizing Protocol (OSC 66) support.""" + # 3rd party import pytest diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index 0144956..ae2eb2d 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -1,4 +1,5 @@ """Tests for sequence-aware text wrapping functions.""" + # std imports import sys import platform diff --git a/tests/test_ucslevel.py b/tests/test_ucslevel.py index 979cfe0..9aea2c9 100644 --- a/tests/test_ucslevel.py +++ b/tests/test_ucslevel.py @@ -1,4 +1,5 @@ """Unicode version level tests for wcwidth.""" + # local import wcwidth diff --git a/tests/test_width.py b/tests/test_width.py index 67d7b01..39b49a1 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -1,4 +1,5 @@ """Tests for width() function.""" + # 3rd party import pytest diff 
--git a/tox.ini b/tox.ini index a921a79..b2150cf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = update, fetch, compile, autopep8, docformatter, docformatter_check, isort, isort_check, pylint, pylint_tests, flake8, flake8_tests, pydocstyle, mypy, codespell, format, lint, docs, verify_tables, py{38, 39, 310, 311, 312, 313, 314}, pypy{38, 39, 310, 311} +envlist = update, fetch, compile, autopep8, docformatter, docformatter_check, isort, isort_check, pylint, pylint_tests, flake8, flake8_tests, pydocstyle, mypy, codespell, format, lint, docs, verify_tables, py{38, 39, 310, 311, 312, 313, 314} skip_missing_interpreters = true [base] @@ -10,7 +10,7 @@ deps = -r requirements-tests39.txt commands = {envpython} -m pytest --cov-config={toxinidir}/tox.ini {posargs:\ --verbose \ --junit-xml=.tox/results.{envname}.xml \ - --durations=3 \ + --durations=10 \ } \ --log-format='%(levelname)s %(relativeCreated)2.2f %(filename)s:%(lineno)d %(message)s' \ tests @@ -61,22 +61,21 @@ precision = 1 [coverage:paths] source = wcwidth/ -# wcwidth itself has no 3rd party dependencies, but to ensure the best available -# version for the newest to oldest python versions for testing, must also use some -# targeted versions to 'compile' those requirements into their frozen form, -# otherwise incompatible packages would be pinned. At the time of this writing the -# files compiled for version 3.9 and later are compiled by python3.13 [WIP]. [testenv:compile] -basepython = python3.13 +basepython = python3.14 commands = python -m compileall {toxinidir}/wcwidth {toxinidir}/bin {toxinidir}/tests {toxinidir}/docs +# wcwidth itself has no 3rd party dependencies, but to ensure the best available +# version for the newest to oldest python versions for testing, must also use some +# targeted versions to 'compile' those requirements into their frozen form, +# otherwise incompatible packages can be pinned. 
[testenv:update_requirements_update] -basepython = python3.13 +basepython = python3.14 deps = pip-tools commands = {[base]pip_compile_command} requirements-update.in -o requirements-update.txt [testenv:update_requirements_docs] -basepython = python3.12 +basepython = python3.14 deps = pip-tools commands = {[base]pip_compile_command} requirements-docs.in -o docs/requirements.txt @@ -93,11 +92,9 @@ commands = {[base]pip_compile_command} requirements-tests38.in -o requirements-t [testenv:py38] deps = -r requirements-tests38.txt -[testenv:pypy38] -deps = -r requirements-tests38.txt - [testenv:update] -basepython = python3.14 +# it is best to use latest python for latest 'unicodedata' for named table items in generated code +basepython = python3.15 usedevelop = true deps = -r requirements-update.txt commands = python {toxinidir}/bin/update-tables.py {posargs:--fetch-all-versions} @@ -139,7 +136,7 @@ commands = flake8 --exclude=tests docs/ wcwidth/ bin/ tests/ [testenv:docs] # matches .readthedocs.yaml and environment -basepython = python3.12 +basepython = python3.14 deps = -r {toxinidir}/docs/requirements.txt commands = sphinx-build -W docs/ build/sphinx @@ -170,29 +167,29 @@ commands = pydocstyle --source --explain {toxinidir}/wcwidth basepython = python3.13 deps = docformatter>=1.7.7 untokenize -commands = - docformatter --in-place --recursive --pre-summary-newline \ +commands = docformatter --in-place --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ [testenv:docformatter_check] -basepython = python3.13 +basepython = python3.14 deps = {[testenv:docformatter]deps} commands = docformatter --check --diff --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ [testenv:isort_check] -basepython = python3.13 +basepython = python3.14 deps = isort commands = isort --diff --check-only wcwidth tests bin 
[testenv:flake8_tests] -basepython = python3.13 +basepython = python3.14 deps = flake8 commands = flake8 --ignore=E501,W504,F401 tests/ [testenv:pylint_tests] -basepython = python3.13 +basepython = python3.14 deps = pytest pylint commands = pylint --rcfile={toxinidir}/.pylintrc \ @@ -210,7 +207,7 @@ warn_redundant_casts = true warn_unused_ignores = true [testenv:codespell] -basepython = python3.13 +basepython = python3.14 deps = codespell commands = codespell --skip="*.pyc,htmlcov,_build,build,*.egg-info,.tox,data,./tests/*.txt,*.csv,*.ods,table_*.py,docs/specs.rst,*.isorted" \ --ignore-words-list="thirdparty,claus,oclock,womens,aprox" \ @@ -218,7 +215,7 @@ commands = codespell --skip="*.pyc,htmlcov,_build,build,*.egg-info,.tox,data,./t --summary --count [testenv:format] -basepython = python3.13 +basepython = python3.14 deps = {[testenv:isort]deps} {[testenv:docformatter]deps} {[testenv:autopep8]deps} @@ -227,16 +224,19 @@ commands = {[testenv:isort]commands} {[testenv:autopep8]commands} [testenv:lint] -basepython = python3.13 +basepython = python3.14 deps = {[testenv:flake8]deps} + {[testenv:mypy]deps} {[testenv:isort_check]deps} {[testenv:pydocstyle]deps} {[testenv:pylint_tests]deps} {[testenv:codespell]deps} -commands = {[testenv:flake8]commands} - {[testenv:flake8_tests]commands} - {[testenv:isort_check]commands} - {[testenv:pydocstyle]commands} +commands = {[testenv:compile]commands} + {[testenv:flake8]commands} + {[testenv:mypy]commands} {[testenv:pylint]commands} + {[testenv:flake8_tests]commands} {[testenv:pylint_tests]commands} {[testenv:codespell]commands} + {[testenv:pydocstyle]commands} + {[testenv:isort_check]commands} diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index b4b4494..2205d77 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -3,6 +3,7 @@ https://github.com/jquast/wcwidth """ + # re-export all functions & definitions, even private ones, from top-level # module path, to allow for 'from wcwidth import _private_func'. 
Of course, # user beware that any _private functions or variables not exported by __all__ diff --git a/wcwidth/_constants.py b/wcwidth/_constants.py index 2b4a87b..5505ef5 100644 --- a/wcwidth/_constants.py +++ b/wcwidth/_constants.py @@ -1,4 +1,5 @@ """Shared data tables and constants for wcwidth.py, _wcwidth.py, and _wcswidth.py.""" + # local from .table_mc import CATEGORY_MC from .table_wide import WIDE_EASTASIAN diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 91d2285..423a6af 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -1,4 +1,5 @@ """This is a python implementation of wcswidth().""" + import typing # local diff --git a/wcwidth/_wcwidth.py b/wcwidth/_wcwidth.py index 0403b32..9e72418 100644 --- a/wcwidth/_wcwidth.py +++ b/wcwidth/_wcwidth.py @@ -1,4 +1,5 @@ """This is a python implementation of wcwidth().""" + # std # std imports from functools import lru_cache diff --git a/wcwidth/bisearch.py b/wcwidth/bisearch.py index becfe86..e95c51b 100644 --- a/wcwidth/bisearch.py +++ b/wcwidth/bisearch.py @@ -1,4 +1,5 @@ """Binary search function for Unicode interval tables.""" + from __future__ import annotations @@ -7,8 +8,8 @@ def bisearch(ucs: int, table: tuple[tuple[int, int], ...]) -> int: Binary search in interval table. :param ucs: Ordinal value of unicode character. - :param table: Tuple of starting and ending ranges of ordinal values, - in form of ``((start, end), ...)``. + :param table: Tuple of starting and ending ranges of ordinal values, in form of ``((start, end), + ...)``. :returns: 1 if ordinal value ucs is found within lookup table, else 0. """ lbound = 0 diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 60a6b7a..e9d5734 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -5,6 +5,7 @@ sequences that begin with ESC (``\x1b``). 
Before calling re.match with these patterns, callers should first check that the character at the current position is ESC for optimal performance. """ + # std imports import re diff --git a/wcwidth/sgr_state.py b/wcwidth/sgr_state.py index b0c8648..5d6e988 100644 --- a/wcwidth/sgr_state.py +++ b/wcwidth/sgr_state.py @@ -5,6 +5,7 @@ etc.) via public API propagate_sgr(), and its dependent functions, cut() and wrap(). It only has attributes necessary to perform its functions, eg 'RED' and 'BLUE' attributes are not defined. """ + from __future__ import annotations # std imports diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index e3dc0b1..87403b7 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,7 +1,7 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-18 23:27:15 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. """ # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index b9cfdc4..ebdb1e2 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,7 +4,7 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. 
""" # pylint: disable=duplicate-code @@ -202,8 +202,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b44,), # Balinese Vowel Sign Pepe..Balinese Adeg Adeg @@ -284,7 +284,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -367,9 +367,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a99,), # Soyombo Gemination Mark ..Soyombo Subjoiner - (0x11b60, 0x11b60,), # (nil) - (0x11b62, 0x11b64,), # (nil) - (0x11b66, 0x11b66,), # (nil) + (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe + (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short + (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -426,10 +426,10 @@ (0x1e2ec, 
0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri @@ -617,9 +617,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # (nil) - (0x11b65, 0x11b65,), # (nil) - (0x11b67, 0x11b67,), # (nil) + (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe + (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O + (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya @@ -1892,8 +1892,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b3d,), # Balinese Sign Rerekan ..Balinese Vowel Sign La L (0x01b42, 0x01b43,), # Balinese Vowel Sign Pepe..Balinese Vowel Sign Pepe @@ -1972,7 +1972,7 @@ (0x10d24, 0x10d27,), # Hanifi 
Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11001, 0x11001,), # Brahmi Sign Anusvara @@ -2055,9 +2055,9 @@ (0x11a59, 0x11a5b,), # Soyombo Vowel Sign Vocal..Soyombo Vowel Length Mar (0x11a8a, 0x11a96,), # Soyombo Final Consonant ..Soyombo Sign Anusvara (0x11a98, 0x11a98,), # Soyombo Gemination Mark - (0x11b60, 0x11b60,), # (nil) - (0x11b62, 0x11b64,), # (nil) - (0x11b66, 0x11b66,), # (nil) + (0x11b60, 0x11b60,), # Sharada Vowel Sign Oe + (0x11b62, 0x11b64,), # Sharada Vowel Sign Ue ..Sharada Vowel Sign Short + (0x11b66, 0x11b66,), # Sharada Vowel Sign Candra E (0x11c30, 0x11c36,), # Bhaiksuki Vowel Sign I ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3d,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Anusvara (0x11c3f, 0x11c3f,), # Bhaiksuki Sign Virama @@ -2114,10 +2114,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index e0ee591..557dc95 100644 --- 
a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,7 +1,7 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ # pylint: disable=duplicate-code CATEGORY_MC = { @@ -181,9 +181,9 @@ (0x11a39, 0x11a39,), # Zanabazar Square Sign Visarga (0x11a57, 0x11a58,), # Soyombo Vowel Sign Ai ..Soyombo Vowel Sign Au (0x11a97, 0x11a97,), # Soyombo Sign Visarga - (0x11b61, 0x11b61,), # (nil) - (0x11b65, 0x11b65,), # (nil) - (0x11b67, 0x11b67,), # (nil) + (0x11b61, 0x11b61,), # Sharada Vowel Sign Ooe + (0x11b65, 0x11b65,), # Sharada Vowel Sign Short O + (0x11b67, 0x11b67,), # Sharada Vowel Sign Candra O (0x11c2f, 0x11c2f,), # Bhaiksuki Vowel Sign Aa (0x11c3e, 0x11c3e,), # Bhaiksuki Sign Visarga (0x11ca9, 0x11ca9,), # Marchen Subjoined Letter Ya diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index 70e4a73..49abce8 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,7 +1,7 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2025-09-15 16:57:50 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. """ # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 898734c..692868a 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. 
""" # pylint: disable=duplicate-code WIDE_EASTASIAN = { @@ -71,10 +71,10 @@ (0x0ff01, 0x0ff60,), # Fullwidth Exclamation Ma..Fullwidth Right White Pa (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma - (0x16ff2, 0x16ff6,), # (nil) - (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char - (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) - (0x18d80, 0x18df2,), # (nil) + (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be + (0x17000, 0x18cd5,), # Tangut Ideograph-17000 ..Khitan Small Script Char + (0x18cff, 0x18d1e,), # Khitan Small Script Char..Tangut Ideograph-18d1e + (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N (0x1affd, 0x1affe,), # Katakana Letter Minnan N..Katakana Letter Minnan N @@ -116,7 +116,7 @@ (0x1f680, 0x1f6c5,), # Rocket ..Left Luggage (0x1f6cc, 0x1f6cc,), # Sleeping Accommodation (0x1f6d0, 0x1f6d2,), # Place Of Worship ..Shopping Trolley - (0x1f6d5, 0x1f6d8,), # Hindu Temple ..(nil) + (0x1f6d5, 0x1f6d8,), # Hindu Temple ..Landslide (0x1f6dc, 0x1f6df,), # Wireless ..Ring Buoy (0x1f6eb, 0x1f6ec,), # Airplane Departure ..Airplane Arriving (0x1f6f4, 0x1f6fc,), # Scooter ..Roller Skate @@ -126,12 +126,12 @@ (0x1f93c, 0x1f945,), # Wrestlers ..Goal Net (0x1f947, 0x1f9ff,), # First Place Medal ..Nazar Amulet (0x1fa70, 0x1fa7c,), # Ballet Shoes ..Crutch - (0x1fa80, 0x1fa8a,), # Yo-yo ..(nil) - (0x1fa8e, 0x1fac6,), # (nil) ..Fingerprint - (0x1fac8, 0x1fac8,), # (nil) - (0x1facd, 0x1fadc,), # (nil) ..Root Vegetable - (0x1fadf, 0x1faea,), # Splatter ..(nil) - (0x1faef, 0x1faf8,), # (nil) ..Rightwards Pushing Hand + (0x1fa80, 0x1fa8a,), # Yo-yo ..Trombone + (0x1fa8e, 0x1fac6,), # Treasure Chest ..Fingerprint + (0x1fac8, 0x1fac8,), # Hairy Creature + (0x1facd, 0x1fadc,), # Orca ..Root 
Vegetable + (0x1fadf, 0x1faea,), # Splatter ..Distorted Face + (0x1faef, 0x1faf8,), # Fight Cloud ..Rightwards Pushing Hand (0x20000, 0x2fffd,), # Cjk Unified Ideograph-20..(nil) (0x30000, 0x3fffd,), # Cjk Unified Ideograph-30..(nil) ), diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index 0e8108d..133103c 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,7 +1,7 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-28 19:35:59 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ # pylint: disable=duplicate-code ZERO_WIDTH = { @@ -147,8 +147,8 @@ (0x01a55, 0x01a5e,), # Tai Tham Consonant Sign ..Tai Tham Consonant Sign (0x01a60, 0x01a7c,), # Tai Tham Sign Sakot ..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01add,), # Combining Doubled Circum..(nil) - (0x01ae0, 0x01aeb,), # (nil) + (0x01ab0, 0x01add,), # Combining Doubled Circum..Combining Dot-and-ring B + (0x01ae0, 0x01aeb,), # Combining Left Tack Abov..Combining Double Rightwa (0x01b00, 0x01b04,), # Balinese Sign Ulu Ricem ..Balinese Sign Bisah (0x01b34, 0x01b44,), # Balinese Sign Rerekan ..Balinese Adeg Adeg (0x01b6b, 0x01b73,), # Balinese Musical Symbol ..Balinese Musical Symbol @@ -222,7 +222,7 @@ (0x10d24, 0x10d27,), # Hanifi Rohingya Sign Har..Hanifi Rohingya Sign Tas (0x10d69, 0x10d6d,), # Garay Vowel Sign E ..Garay Consonant Nasaliza (0x10eab, 0x10eac,), # Yezidi Combining Hamza M..Yezidi Combining Madda M - (0x10efa, 0x10eff,), # (nil) ..Arabic Small Low Word Ma + (0x10efa, 0x10eff,), # Arabic Double Vertical B..Arabic Small Low Word Ma (0x10f46, 0x10f50,), # Sogdian Combining Dot Be..Sogdian Combining Stroke (0x10f82, 0x10f85,), # Old Uyghur Combining Dot..Old Uyghur Combining Two (0x11000, 0x11002,), # Brahmi Sign Candrabindu ..Brahmi Sign Visarga @@ -284,7 +284,7 @@ (0x11a47, 0x11a47,), # Zanabazar 
Square Subjoiner (0x11a51, 0x11a5b,), # Soyombo Vowel Sign I ..Soyombo Vowel Length Mar (0x11a8a, 0x11a99,), # Soyombo Final Consonant ..Soyombo Subjoiner - (0x11b60, 0x11b67,), # (nil) + (0x11b60, 0x11b67,), # Sharada Vowel Sign Oe ..Sharada Vowel Sign Candr (0x11c2f, 0x11c36,), # Bhaiksuki Vowel Sign Aa ..Bhaiksuki Vowel Sign Voc (0x11c38, 0x11c3f,), # Bhaiksuki Vowel Sign E ..Bhaiksuki Sign Virama (0x11c92, 0x11ca7,), # Marchen Subjoined Letter..Marchen Subjoined Letter @@ -339,10 +339,10 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e5ee, 0x1e5ef,), # Ol Onal Sign Mu ..Ol Onal Sign Ikir - (0x1e6e3, 0x1e6e3,), # (nil) - (0x1e6e6, 0x1e6e6,), # (nil) - (0x1e6ee, 0x1e6ef,), # (nil) - (0x1e6f5, 0x1e6f5,), # (nil) + (0x1e6e3, 0x1e6e3,), # Tai Yo Sign Ue + (0x1e6e6, 0x1e6e6,), # Tai Yo Sign Au + (0x1e6ee, 0x1e6ef,), # Tai Yo Sign Ay ..Tai Yo Sign Ang + (0x1e6f5, 0x1e6f5,), # Tai Yo Sign Om (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta (0xe0000, 0xe0fff,), # (nil) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 6687053..4b495e9 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -27,6 +27,7 @@ .. versionadded:: 0.6.1 """ + from __future__ import annotations # std imports @@ -58,10 +59,9 @@ class TextSizingParams(typing.NamedTuple): """ Parsed parameters from a text sizing escape sequence (OSC 66). - :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall - and ``scale * width`` columns wide. - :param width: Width in cells (0-7). When 0, width is auto-calculated - from the inner text. + :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall and ``scale * width`` + columns wide. + :param width: Width in cells (0-7). When 0, width is auto-calculated from the inner text. :param numerator: Fractional scaling numerator (0-15). 
:param denominator: Fractional scaling denominator (0-15). :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). @@ -76,7 +76,8 @@ class TextSizingParams(typing.NamedTuple): horizontal_align: int = 0 def __repr__(self): - """Return a compact representation including only non-default fields. + """ + Return a compact representation including only non-default fields. This avoids verbose output when most fields are defaults. """ @@ -182,10 +183,8 @@ def display_width(self, ambiguous_width: int = 1) -> int: Calculate the display width of a text sizing sequence. :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Display width in terminal cells. - - When ``width > 0``, returns ``params.scale * params.width``. - When ``width == 0``, returns ``params.scale * measured_inner_width``. + :returns: Display width in terminal cells. When ``width > 0``, returns ``params.scale * + params.width``. When ``width == 0``, returns ``params.scale * measured_inner_width``. """ if self.params.width > 0: return self.params.scale * self.params.width diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index cc3a3d7..f56b4a2 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -4,6 +4,7 @@ This module provides functions for wrapping text that may contain terminal escape sequences, with proper handling of Unicode grapheme clusters and character display widths. 
""" + from __future__ import annotations # std imports diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 6f9e451..749ea58 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -481,7 +481,7 @@ def _remove_visible_tail(n: int) -> None: overlap_end = min(move_end, end) overlap = overlap_end - overlap_start if overlap > 0: - _append_visible(fillchar * overlap, overlap) + _append_visible(fillchar * overlap, overlap, overlap_start) col += n_left idx = match.end() continue @@ -501,98 +501,23 @@ def _remove_visible_tail(n: int) -> None: continue if (ts_match := TEXT_SIZING_PATTERN.match(seq)): # OSC 66 (text sizing) has positive width - text_size = TextSizing.from_match(ts_match, control_codes='parse') - ts_width = text_size.display_width(ambiguous_width) - if col >= start and col + ts_width <= end: - # fits as-is, keep going - _append_seq(seq) - col += ts_width - elif col < end and col + ts_width > start: - # Clip inside the text-sizing block. Only include whole inner units - # (scaled slots) as sequences. Partial units are represented by - # fillchar characters covering the visible columns. - rel_start = max(0, start - col) - rel_end = min(end, col + ts_width) - col - scale = text_size.params.scale - - # Build unit list: for width>0, units are declared slots (one per width) - # otherwise units are grapheme clusters of inner text. 
- units: list[tuple[str, int]] = [] - if text_size.params.width > 0: - inner_graphemes = list(iter_graphemes(text_size.text)) - for j in range(text_size.params.width): - g = inner_graphemes[j] if j < len(inner_graphemes) else '' - # declared slots each occupy exactly `scale` columns - units.append((g, scale)) - else: - for g in iter_graphemes(text_size.text): - inner_w = width(g, ambiguous_width=ambiguous_width) - units.append((g, inner_w * scale)) - - pos = 0 - pending_run_texts: list[str] = [] - pending_run_count = 0 - - def emit_pending_run(): - nonlocal pending_run_texts, pending_run_count - if pending_run_count == 0: - return - inner_text = ''.join(pending_run_texts) - if text_size.params.width > 0: - params = TextSizingParams( - scale, - pending_run_count, - text_size.params.numerator, - text_size.params.denominator, - text_size.params.vertical_align, - text_size.params.horizontal_align) - else: - params = TextSizingParams( - scale, - 0, - text_size.params.numerator, - text_size.params.denominator, - text_size.params.vertical_align, - text_size.params.horizontal_align) - ts = TextSizing(params, inner_text, text_size.terminator) - _append_seq(ts.make_sequence()) - pending_run_texts = [] - pending_run_count = 0 - - for unit_text, unit_scaled_w in units: - unit_start = pos - unit_end = pos + unit_scaled_w - if unit_end <= rel_start: - pos = unit_end - continue - if unit_start >= rel_end: - break - overlap = min(unit_end, rel_end) - max(unit_start, rel_start) - - # If overlap covers entire unit, include it in pending run. 
- if overlap == unit_scaled_w and unit_scaled_w > 0: - pending_run_texts.append(unit_text) - pending_run_count += 1 - else: - # Partial unit or gap: flush pending run and emit fillchars - emit_pending_run() - if overlap > 0: - # absolute start column of this overlap inside the ts block - abs_start = col + max(unit_start, rel_start) - _append_visible(fillchar * overlap, overlap, abs_start) - - pos = unit_end - - # flush remaining run if any - emit_pending_run() - - col += ts_width - else: - # XXX nothing to clip? TODO breakpoint() and verify - col += ts_width - idx = ts_match.end() + col, visible_count = _text_sizing_clip( + TextSizing.from_match(ts_match), + col=col, start=start, end=end, + output_tokens=output_tokens, + visible_count=visible_count, + fillchar=fillchar, ambiguous_width=ambiguous_width, + ) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + idx = match.end() continue + # Other zero-width sequences (OSC hyperlinks, etc.) — preserve as-is + _append_seq(seq) + idx = match.end() + continue + # 2. Handle bare ESC (not a valid sequence) if char == '\x1b': _append_seq(char) @@ -686,3 +611,87 @@ def emit_pending_run(): result += '\x1b[0m' return result + + +def _text_sizing_clip( + ts: TextSizing, + *, + col: int, + start: int, + end: int, + output_tokens: list[tuple], + visible_count: int, + fillchar: str = ' ', + ambiguous_width: int = 1, +) -> tuple[int, int]: + """ + Emit tokens for a text-sizing sequence into ``output_tokens``, clipped to ``[start, end)``. + + Returns ``(new_col, new_visible_count)``. + + This was formerly ``TextSizing.clip()`` in :mod:`wcwidth.text_sizing`. It was moved here to + break a circular dependency loop (:mod:`text_sizing` imported :mod:`_width`, and :mod:`_width` + imported :mod:`text_sizing`). 
+ """ + # pylint: disable=too-many-locals + ts_width = ts.display_width(ambiguous_width) + if col >= start and col + ts_width <= end: + output_tokens.append(('seq', ts.make_sequence())) + return col + ts_width, visible_count + if col >= end or col + ts_width <= start: + return col + ts_width, visible_count + + # Partial overlap: decompose into units (graphemes at `scale` cells each), + # emit whole units as sequences and partial units as fillchars. + rel_start = max(0, start - col) + rel_end = min(end, col + ts_width) - col + scale = ts.params.scale + + units: list[tuple[str, int]] = [] + if ts.params.width > 0: + inner_graphemes = list(iter_graphemes(ts.text)) + for j in range(ts.params.width): + g = inner_graphemes[j] if j < len(inner_graphemes) else '' + units.append((g, scale)) + else: + for g in iter_graphemes(ts.text): + units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) + + pos = 0 + pending_texts: list[str] = [] + + def flush(): + if not pending_texts: + return + params = TextSizingParams( + scale, + len(pending_texts) if ts.params.width > 0 else 0, + ts.params.numerator, + ts.params.denominator, + ts.params.vertical_align, + ts.params.horizontal_align) + output_tokens.append( + ('seq', TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence())) + pending_texts.clear() + + for unit_text, unit_w in units: + unit_start = pos + unit_end = pos + unit_w + if unit_end <= rel_start: + pos = unit_end + continue + if unit_start >= rel_end: + break + overlap = min(unit_end, rel_end) - max(unit_start, rel_start) + if overlap == unit_w and unit_w > 0: + pending_texts.append(unit_text) + else: + flush() + if overlap > 0: + abs_start = col + max(unit_start, rel_start) + output_tokens.append(('vis', fillchar * overlap, overlap, abs_start)) + visible_count += overlap + pos = unit_end + + flush() + return col + ts_width, visible_count From 90e90f8c5801a9f25d5dc78790d6e360d8596bdc Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 
2026 14:21:11 -0400 Subject: [PATCH 23/70] annoying copilot linting --- tests/test_text_sizing.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index 86757e0..f4f8efe 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -4,8 +4,7 @@ import pytest # local -import wcwidth -from wcwidth import TextSizing, TextSizingParams +from wcwidth import TextSizing, TextSizingParams, width, wcswidth, clip, iter_sequences, strip_sequences from wcwidth.text_sizing import TEXT_FIELD_MAPPING from wcwidth.escape_sequences import TEXT_SIZING_PATTERN @@ -52,7 +51,7 @@ def test_text_sizing_width_control_codes(given_params, expected_remainder, expec seq2 = '\x1b]66;' + given_params + ';ABC' + '\x1b\\' for seq in (seq1, seq2): with pytest.raises(ValueError) as exc_info: - wcwidth.width(seq, control_codes='strict') + width(seq, control_codes='strict') assert exc_info.value.args[0].startswith(expected_exc) @@ -87,16 +86,16 @@ def test_text_sizing_width(params, text, expected_width): assert TextSizing.from_match(ts_match2) == TextSizing(params, text, terminator='\x1b\\') # and external width(), - assert wcwidth.width(seq1) == expected_width - assert wcwidth.width(seq2) == expected_width + assert width(seq1) == expected_width + assert width(seq2) == expected_width # verify 'strict' does not raise ValueError - wcwidth.width(seq1, control_codes='strict') - wcwidth.width(seq2, control_codes='strict') + width(seq1, control_codes='strict') + width(seq2, control_codes='strict') # and verify 'ignore' measures only inner_text (does not parse scale or width) - assert wcwidth.width(seq1, control_codes='ignore') == wcwidth.wcswidth(text) - assert wcwidth.width(seq2, control_codes='ignore') == wcwidth.wcswidth(text) + assert width(seq1, control_codes='ignore') == wcswidth(text) + assert width(seq2, control_codes='ignore') == wcswidth(text) 
@pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ @@ -121,9 +120,9 @@ def test_text_sizing_sequence(given_sequence, expected_text, expected_params, ex text_size = TextSizing.from_match(ts_match) assert text_size.params.make_sequence() == expected_params assert text_size.text == expected_text - assert wcwidth.width(given_sequence, control_codes='parse') == expected_width - assert wcwidth.width(given_sequence, control_codes='strict') == expected_width - assert wcwidth.width(given_sequence, control_codes='ignore') == wcwidth.wcswidth(expected_text) + assert width(given_sequence, control_codes='parse') == expected_width + assert width(given_sequence, control_codes='strict') == expected_width + assert width(given_sequence, control_codes='ignore') == wcswidth(expected_text) @pytest.mark.parametrize('text,expected', [ @@ -141,8 +140,8 @@ def test_text_sizing_sequence(given_sequence, expected_text, expected_params, ex ]) def test_strings_with_text_sizing(text, expected): """Verify measured width strings containing OSC66.""" - assert wcwidth.width(text) == expected - assert wcwidth.width(text, control_codes='strict') == expected + assert width(text) == expected + assert width(text, control_codes='strict') == expected @pytest.mark.parametrize('text,expected', [ @@ -155,7 +154,7 @@ def test_strings_with_text_sizing(text, expected): ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), ]) def test_strip_strings_with_text_sizing(text, expected): - assert wcwidth.strip_sequences(text) == expected + assert strip_sequences(text) == expected @pytest.mark.parametrize('text,expected_segs', [ @@ -163,7 +162,7 @@ def test_strip_strings_with_text_sizing(text, expected): ('abc\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\def', [('abc', False), ('\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\', True), ('def', False)]), ]) def test_iter_sequences_text_sizing(text, expected_segs): - assert list(wcwidth.iter_sequences(text)) == expected_segs + assert list(iter_sequences(text)) == 
expected_segs @pytest.mark.parametrize('text,start,end,expected', [ @@ -177,7 +176,7 @@ def test_iter_sequences_text_sizing(text, expected_segs): ]) def test_clip_text_sizing_basic(text, start, end, expected): """Test basic support of clip() with text sizing sequence.""" - assert repr(wcwidth.clip(text, start, end)) == repr(expected) + assert repr(clip(text, start, end)) == repr(expected) @pytest.mark.parametrize('text,start,end,expected', [ @@ -191,7 +190,7 @@ def test_clip_text_sizing_basic(text, start, end, expected): ]) def test_clip_text_sizing_scaled(text, start, end, expected): """Test support of clip() with scale=N arguments.""" - assert repr(wcwidth.clip(text, start, end)) == repr(expected) + assert repr(clip(text, start, end)) == repr(expected) @pytest.mark.parametrize('text,start,end,expected', [ @@ -272,4 +271,4 @@ def test_clip_text_sizing_scaled(text, start, end, expected): ]) def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" - assert repr(wcwidth.clip(text, start, end, fillchar='.')) == repr(expected) + assert repr(clip(text, start, end, fillchar='.')) == repr(expected) From 59c25aecb56f95d15b847e768217455316bfac1a Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 14:28:30 -0400 Subject: [PATCH 24/70] lint/docformatter --- tests/test_clip_cursors.py | 2 +- tests/test_text_sizing.py | 9 +++++++-- tox.ini | 3 +-- wcwidth/table_ambiguous.py | 1 + wcwidth/table_grapheme.py | 1 + wcwidth/table_mc.py | 1 + wcwidth/table_vs16.py | 1 + wcwidth/table_wide.py | 1 + wcwidth/table_zero.py | 1 + 9 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 93bd743..4e59343 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -46,4 +46,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): the clipped output if the moved-to 
columns are within the clip window; cursor-left allows subsequent characters to overwrite previous content and the clip should reflect that. """ - assert repr(clip(text, start, end)) == repr(expected) \ No newline at end of file + assert repr(clip(text, start, end)) == repr(expected) diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py index f4f8efe..ff634ea 100644 --- a/tests/test_text_sizing.py +++ b/tests/test_text_sizing.py @@ -4,12 +4,17 @@ import pytest # local -from wcwidth import TextSizing, TextSizingParams, width, wcswidth, clip, iter_sequences, strip_sequences +from wcwidth import (TextSizing, + TextSizingParams, + clip, + width, + wcswidth, + iter_sequences, + strip_sequences) from wcwidth.text_sizing import TEXT_FIELD_MAPPING from wcwidth.escape_sequences import TEXT_SIZING_PATTERN _W_HI = TEXT_FIELD_MAPPING['w'].high -_S_HI = TEXT_FIELD_MAPPING['s'].high _N_HI = TEXT_FIELD_MAPPING['n'].high _D_HI = TEXT_FIELD_MAPPING['d'].high diff --git a/tox.ini b/tox.ini index b2150cf..df77374 100644 --- a/tox.ini +++ b/tox.ini @@ -165,8 +165,7 @@ commands = pydocstyle --source --explain {toxinidir}/wcwidth [testenv:docformatter] basepython = python3.13 -deps = docformatter>=1.7.7 - untokenize +deps = docformatter>=1.7.8 commands = docformatter --in-place --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index 87403b7..2c40498 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -3,6 +3,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. 
""" + # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index ebdb1e2..d265b66 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -6,6 +6,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code GRAPHEME_CR = ( diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 557dc95..63acce9 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -3,6 +3,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code CATEGORY_MC = { '17.0.0': ( diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index 49abce8..a5fc0a8 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -3,6 +3,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. """ + # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { '9.0.0': ( diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 692868a..9d2ebd5 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -3,6 +3,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code WIDE_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index 133103c..b669f70 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -3,6 +3,7 @@ This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code ZERO_WIDTH = { '17.0.0': ( From 3a1670218160dde41b6e9767154bc6b75ad09616 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 15:27:20 -0400 Subject: [PATCH 25/70] Refactor -- no API or behavior changes - Remove all late-binding / lazy imports from inside functions - Tests and tooling updated to Python 3.14. 
- linting formatting according to the new tools - PyPy test support dropped: this library does not perform any novel use of python internals, and does not benefit from the extra testing. --- docs/api.rst | 4 - tests/test_text_sizing.py | 279 ------------------------------------ wcwidth/__init__.py | 5 +- wcwidth/_width.py | 9 +- wcwidth/escape_sequences.py | 17 +-- wcwidth/text_sizing.py | 196 ------------------------- wcwidth/wcwidth.py | 112 ++------------- 7 files changed, 12 insertions(+), 610 deletions(-) delete mode 100644 tests/test_text_sizing.py delete mode 100644 wcwidth/text_sizing.py diff --git a/docs/api.rst b/docs/api.rst index a80eb40..55d288b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,8 +36,4 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions -.. autofunction:: wcwidth.TextSizing - -.. autofunction:: wcwidth.TextSizingParams - .. _SEMVER: https://semver.org diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py deleted file mode 100644 index ff634ea..0000000 --- a/tests/test_text_sizing.py +++ /dev/null @@ -1,279 +0,0 @@ -"""Tests for Text Sizing Protocol (OSC 66) support.""" - -# 3rd party -import pytest - -# local -from wcwidth import (TextSizing, - TextSizingParams, - clip, - width, - wcswidth, - iter_sequences, - strip_sequences) -from wcwidth.text_sizing import TEXT_FIELD_MAPPING -from wcwidth.escape_sequences import TEXT_SIZING_PATTERN - -_W_HI = TEXT_FIELD_MAPPING['w'].high -_N_HI = TEXT_FIELD_MAPPING['n'].high -_D_HI = TEXT_FIELD_MAPPING['d'].high - -CONTROL_CODES_PARAMS_CASES = [ - ('x=2', "", "Unknown text sizing field 'x' in "), - ('s=3:x=3', "s=3", "Unknown text sizing field 'x' in "), - ('s=2:x=3:w=9', f"s=2:w={_W_HI}", "Unknown text sizing field 'x' in "), - ('xyz=2', "", "Unknown text sizing field 'xyz' in "), - ('xxx', "", "Expected '=' in text sizing parameter"), - ('s=xxx', "", "Illegal text sizing value 'xxx' in "), - ('s=-99', "", "Out of bounds text 
sizing value '-99' in "), - ('s=99', f"s={_W_HI}", "Out of bounds text sizing value '99' in "), - ('w=-1', "", "Out of bounds text sizing value '-1' in "), - ('w=8', f"w={_W_HI}", "Out of bounds text sizing value '8' in "), - ('n=20', f"n={_N_HI}", "Out of bounds text sizing value '20' in "), - ('d=99', f"d={_D_HI}", "Out of bounds text sizing value '99' in "), - ('v=5', "v=2", "Out of bounds text sizing value '5' in "), - ('h=3', "h=2", "Out of bounds text sizing value '3' in "), -] - - -@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) -def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): - """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" - # assert control_codes='strict' raises expected exception, - with pytest.raises(ValueError) as exc_info: - TextSizingParams.from_params(given_params, control_codes='strict') - assert exc_info.value.args[0].startswith(expected_exc) - - # when 'parse' (default), any illegal argument or value is filtered, excluded, or clipped - params = TextSizingParams.from_params(given_params) - assert params.make_sequence() == expected_remainder - - -@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) -def test_text_sizing_width_control_codes(given_params, expected_remainder, expected_exc): - """Verify control_codes='strict' with invalid OSC 66 sequences in wciwdth.width().""" - seq1 = '\x1b]66;' + given_params + ';ABC' + '\x07' - seq2 = '\x1b]66;' + given_params + ';ABC' + '\x1b\\' - for seq in (seq1, seq2): - with pytest.raises(ValueError) as exc_info: - width(seq, control_codes='strict') - assert exc_info.value.args[0].startswith(expected_exc) - - -@pytest.mark.parametrize('params,text,expected_width', [ - # cases of static width=N values, - (TextSizingParams(scale=2, width=1), 'climclam', 2), - (TextSizingParams(scale=2, width=3), 'anything', 6), - 
(TextSizingParams(scale=1, width=5), '', 5), - (TextSizingParams(scale=3, width=1), 'x', 3), - # and automatic width (width=0) values, - (TextSizingParams(), '', 0), - (TextSizingParams(), 'AB', 2), - (TextSizingParams(), '中', 2), - (TextSizingParams(scale=2), 'AB', 4), - (TextSizingParams(scale=2), '中', 4), - (TextSizingParams(scale=3), '', 0), - (TextSizingParams(scale=7, width=7, numerator=15, denominator=15, - vertical_align=2, horizontal_align=2), 'x!yzzy', 49), -]) -def test_text_sizing_width(params, text, expected_width): - """Verify width using with both kinds of terminator.""" - # verify internal TextSizing.display_width() result, - assert TextSizing(params, text, terminator='\x07').display_width() == expected_width - assert TextSizing(params, text, terminator='\x1b\\').display_width() == expected_width - seq1 = TextSizing(params, text, terminator='\x07').make_sequence() - seq2 = TextSizing(params, text, terminator='\x1b\\').make_sequence() - - # verify round-trip - ts_match1, ts_match2 = TEXT_SIZING_PATTERN.match(seq1), TEXT_SIZING_PATTERN.match(seq2) - assert ts_match1 and ts_match2 - assert TextSizing.from_match(ts_match1) == TextSizing(params, text, terminator='\x07') - assert TextSizing.from_match(ts_match2) == TextSizing(params, text, terminator='\x1b\\') - - # and external width(), - assert width(seq1) == expected_width - assert width(seq2) == expected_width - - # verify 'strict' does not raise ValueError - width(seq1, control_codes='strict') - width(seq2, control_codes='strict') - - # and verify 'ignore' measures only inner_text (does not parse scale or width) - assert width(seq1, control_codes='ignore') == wcswidth(text) - assert width(seq2, control_codes='ignore') == wcswidth(text) - - -@pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ - ('\x1b]66;s=2:w=2;AB\x07', 'AB', 's=2:w=2', 4), - ('\x1b]66;s=2:w=2;\u4e2d\x07', '\u4e2d', 's=2:w=2', 4), - ('\x1b]66;s=3:w=1;x\x07', 'x', 's=3:w=1', 3), - 
('\x1b]66;w=5;hello\x07', 'hello', 'w=5', 5), - ('\x1b]66;s=2:w=3;anything\x07', 'anything', 's=2:w=3', 6), - ('\x1b]66;w=3;x\x07', 'x', 'w=3', 3), - ('\x1b]66;s=1;AB\x07', 'AB', '', 2), - ('\x1b]66;s=2;AB\x07', 'AB', 's=2', 4), - ('\x1b]66;s=2;中\x07', '中', 's=2', 4), - ('\x1b]66;s=2;\x07', '', 's=2', 0), - ('\x1b]66;s=1:w=1;\x07', '', 'w=1', 1), - ('\x1b]66;w=2;A\x07', 'A', 'w=2', 2), - ('\x1b]66;s=2:w=3;text\x1b\\', 'text', 's=2:w=3', 6), -]) -def test_text_sizing_sequence(given_sequence, expected_text, expected_params, expected_width): - """Verify parsing and measured width of raw OSC 66 sequence.""" - ts_match = TEXT_SIZING_PATTERN.match(given_sequence) - assert ts_match is not None - text_size = TextSizing.from_match(ts_match) - assert text_size.params.make_sequence() == expected_params - assert text_size.text == expected_text - assert width(given_sequence, control_codes='parse') == expected_width - assert width(given_sequence, control_codes='strict') == expected_width - assert width(given_sequence, control_codes='ignore') == wcswidth(expected_text) - - -@pytest.mark.parametrize('text,expected', [ - ('\x1b]66;s=2:w=3:n=1:d=2:v=1:h=2;x!yzzy\x1b\\', 6), - ('\x1b]66;s=2:w=3;anything\x07', 6), - ('\x1b]66;w=3;x\x07', 3), - ('\x1b]66;s=1:w=0;AB\x07', 2), - ('\x1b]66;s=2:w=0;AB\x07', 4), - ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), # '中' - ('\x1b]66;s=1:w=0;\x07', 0), - ('abc\x1b]66;w=3;x\x07def', 9), - ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), - ('\x1b]66;s=2:w=3;text\x1b\\', 6), - ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), -]) -def test_strings_with_text_sizing(text, expected): - """Verify measured width strings containing OSC66.""" - assert width(text) == expected - assert width(text, control_codes='strict') == expected - - -@pytest.mark.parametrize('text,expected', [ - ('\x1b]66;s=2;hello\x07', 'hello'), - ('\x1b]66;s=2;hello\x1b\\', 'hello'), - ('\x1b]66;;text\x07', 'text'), - ('\x1b]66;s=3:w=2;\x07', ''), - ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), - 
('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), - ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), -]) -def test_strip_strings_with_text_sizing(text, expected): - assert strip_sequences(text) == expected - - -@pytest.mark.parametrize('text,expected_segs', [ - ('abc\x1b]66;s=2;hello\x07def', [('abc', False), ('\x1b]66;s=2;hello\x07', True), ('def', False)]), - ('abc\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\def', [('abc', False), ('\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\', True), ('def', False)]), -]) -def test_iter_sequences_text_sizing(text, expected_segs): - assert list(iter_sequences(text)) == expected_segs - - -@pytest.mark.parametrize('text,start,end,expected', [ - ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), - ('\x1b]66;w=3;ABC\x07', 0, 2, '\x1b]66;w=2;AB\x07'), - ('\x1b]66;w=3;ABC\x07', 1, 3, '\x1b]66;w=2;BC\x07'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab\x1b]66;w=1;X\x07'), - ('ab\x1b]66;w=2;XY\x07cd', 3, 6, '\x1b]66;w=1;Y\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), -]) -def test_clip_text_sizing_basic(text, start, end, expected): - """Test basic support of clip() with text sizing sequence.""" - assert repr(clip(text, start, end)) == repr(expected) - - -@pytest.mark.parametrize('text,start,end,expected', [ - ('\x1b]66;s=2;ABC\x07', 0, 0, ''), - ('\x1b]66;s=2;ABC\x07', 6, 6, ''), - ('\x1b]66;s=2;ABC\x07', 0, 2, '\x1b]66;s=2;A\x07'), - ('\x1b]66;s=2;ABC\x07', 0, 4, '\x1b]66;s=2;AB\x07'), - ('\x1b]66;s=2;ABC\x07', 0, 6, '\x1b]66;s=2;ABC\x07'), - ('\x1b]66;s=2;ABC\x07', 2, 6, '\x1b]66;s=2;BC\x07'), - ('\x1b]66;s=2;ABC\x07', 4, 6, '\x1b]66;s=2;C\x07'), -]) -def test_clip_text_sizing_scaled(text, start, end, expected): - """Test support of clip() with scale=N arguments.""" - assert repr(clip(text, start, end)) == repr(expected) - - -@pytest.mark.parametrize('text,start,end,expected', [ - # a b c - # === === === - # 012 345 678 - # . - # .. - # *a* - # *a* . - # ... *b* - # ... *b* . - # ... *b* .. 
- # ... *b* *c* - ('\x1b]66;s=3;ABC\x07', 0, 0, ''), - ('\x1b]66;s=3;ABC\x07', 0, 1, '.'), - ('\x1b]66;s=3;ABC\x07', 0, 2, '..'), - ('\x1b]66;s=3;ABC\x07', 0, 3, '\x1b]66;s=3;A\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 4, '\x1b]66;s=3;A\x07.'), - ('\x1b]66;s=3;ABC\x07', 0, 5, '\x1b]66;s=3;A\x07..'), - ('\x1b]66;s=3;ABC\x07', 0, 6, '\x1b]66;s=3;AB\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 7, '\x1b]66;s=3;AB\x07.'), - ('\x1b]66;s=3;ABC\x07', 0, 8, '\x1b]66;s=3;AB\x07..'), - ('\x1b]66;s=3;ABC\x07', 0, 9, '\x1b]66;s=3;ABC\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 10, '\x1b]66;s=3;ABC\x07'), - # a b - # === === === - # 012 345 678 - # . 1, 2 - # .. 1, 3 - # .. . 1, 4 - # .. .. 1, 5 - # .. *b* 1, 6 - # .. *b* . 1, 7 - # .. *b* .. 1, 8 - # .. *b* *c* 1, 9 - ('\x1b]66;s=3;ABC\x07', 1, 1, ''), - ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), - ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), - ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), - ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), - ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 1, 10, '..\x1b]66;s=3;BC\x07'), - # two-thirds of string 'A' and half of string 'B' is fillchar - # ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), - # half of string 'A' and all of string 'B' - # a b - # === === === - # 012 345 678 - # . 2, 3 - # . . 2, 4 - # . .. 2, 5 - # . *b* 2, 6 - # . *b* . 2, 7 - # . *b* .. 2, 8 - # . *b* *c* 2, 9 - ('\x1b]66;s=3;ABC\x07', 2, 2, ''), - ('\x1b]66;s=3;ABC\x07', 2, 3, '.'), - ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), - ('\x1b]66;s=3;ABC\x07', 2, 5, '...'), - ('\x1b]66;s=3;ABC\x07', 2, 6, '.\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 2, 7, '.\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 2, 8, '.\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 2, 9, '.\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 2, 10, '.\x1b]66;s=3;BC\x07'), - # and now 3:10, should be easy ... 
- ('\x1b]66;s=3;ABC\x07', 3, 3, ''), - ('\x1b]66;s=3;ABC\x07', 3, 4, '.'), - ('\x1b]66;s=3;ABC\x07', 3, 5, '..'), - ('\x1b]66;s=3;ABC\x07', 3, 6, '\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 3, 7, '\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 3, 8, '\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 3, 9, '\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 3, 10, '\x1b]66;s=3;BC\x07'), -]) -def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): - """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" - assert repr(clip(text, start, end, fillchar='.')) == repr(expected) diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 2205d77..fe9f002 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -31,14 +31,13 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr -from .text_sizing import TextSizing, TextSizingParams # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr', 'TextSizing', 'TextSizingParams') + 'list_versions', 'propagate_sgr') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places @@ -47,7 +46,7 @@ # std imports from importlib import metadata as importlib_metadata except ImportError: # pragma: no cover - fallback for very old Pythons - importlib_metadata = None + importlib_metadata = None # type: ignore[assignment] if importlib_metadata is not None: try: diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 9dc68ed..90105c1 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -12,11 +12,9 @@ _FITZPATRICK_RANGE, _REGIONAL_INDICATOR_SET) from .table_vs16 import VS16_NARROW_TO_WIDE -from .text_sizing import TextSizing from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .escape_sequences import (ZERO_WIDTH_PATTERN, - TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -126,8 +124,7 @@ def width( # Check for escape sequences that can't be ignored, if present if '\x1b' not in text or ( not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) and - not TEXT_SIZING_PATTERN.search(text) + not CURSOR_LEFT_SEQUENCE.search(text) ): control_codes = 'ignore' @@ -169,10 +166,6 @@ def width( elif (left := CURSOR_LEFT_SEQUENCE.match(seq)): current_col = max(0, current_col - int(left.group(1) or 1)) - # Or OSC 66 (kitty text sizing) - elif (ts_match := TEXT_SIZING_PATTERN.match(seq)): - text_size = TextSizing.from_match(ts_match, control_codes=control_codes) - current_col += 
text_size.display_width(ambiguous_width) idx = match.end() else: # Errant ESC or unknown sequence: only the first character is zero-width diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index e9d5734..67f6a63 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -11,20 +11,12 @@ import typing -# Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. -# Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). -# https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ -TEXT_SIZING_PATTERN = re.compile( - r'\x1b\]66;([^;\x07\x1b]*);([^\x07\x1b]*)(\x07|\x1b\\)' -) - # Zero-width escape sequences (SGR, OSC, CSI, etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( # CSI sequences r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]|' - # OSC sequences, note that text sizing protocol (OSC 66) is special case in width() and clip(), - # and contrary to the variable name, it is positive width. + # OSC sequences r'\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)|' # APC sequences r'\x1b_[^\x1b\x07]*(?:\x07|\x1b\\)|' @@ -136,9 +128,6 @@ def strip_sequences(text: str) -> str: r""" Return text with all terminal escape sequences removed. - For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), - the inner text is preserved. - Unknown or incomplete ESC sequences are preserved. :param text: String that may contain terminal escape sequences. 
@@ -154,11 +143,7 @@ def strip_sequences(text: str) -> str: 'hello' >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') 'bold red text' - >>> strip_sequences('\x1b]66;s=2;hello\x07') - 'hello' >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') '[view]' """ - if '\x1b]66;' in text: - text = TEXT_SIZING_PATTERN.sub(r'\2', text) return ZERO_WIDTH_PATTERN.sub('', text) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py deleted file mode 100644 index 4b495e9..0000000 --- a/wcwidth/text_sizing.py +++ /dev/null @@ -1,196 +0,0 @@ -r""" -`kitty text sizing protocol`_ (OSC 66) parsing and measurement. - -The kitty text sizing protocol allows terminal apps to explicitly tell -terminals how many cells text occupies, using the escape sequence:: - - ESC ] 66 ; metadata ; text BEL/ST - -Metadata is colon-separated ``key=value`` pairs: - -- ``s``: scale -- ``w``: width in cells -- ``n``: fractional numerator -- ``d``: fractional denominator -- ``v``: vertical alignment -- ``h``: horizontal alignment - -Parsing is pretty straight-forward: - -- When ``w > 0``, return ``s * w``. -- Otherwise ``w == 0``, ``s * wcswidth(inner_text_width)`` cells. - -Numerator, denominator, and alignment codes and values are parsed but otherwise ignored -and have no effect on measurements made in this library. - -.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ - -.. 
versionadded:: 0.6.1 -""" - -from __future__ import annotations - -# std imports -import re - -import typing - -# local -from ._wcswidth import wcswidth - - -class _FieldMeta(typing.NamedTuple): - name: str - low: int - high: int - default: int - - -TEXT_FIELD_MAPPING: dict[str, _FieldMeta] = { - 's': _FieldMeta(name='scale', low=1, high=7, default=1), - 'w': _FieldMeta(name='width', low=0, high=7, default=0), - 'n': _FieldMeta(name='numerator', low=0, high=15, default=0), - 'd': _FieldMeta(name='denominator', low=0, high=15, default=0), - 'v': _FieldMeta(name='vertical_align', low=0, high=2, default=0), - 'h': _FieldMeta(name='horizontal_align', low=0, high=2, default=0)} - - -class TextSizingParams(typing.NamedTuple): - """ - Parsed parameters from a text sizing escape sequence (OSC 66). - - :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall and ``scale * width`` - columns wide. - :param width: Width in cells (0-7). When 0, width is auto-calculated from the inner text. - :param numerator: Fractional scaling numerator (0-15). - :param denominator: Fractional scaling denominator (0-15). - :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). - :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). - """ - - scale: int = 1 - width: int = 0 - numerator: int = 0 - denominator: int = 0 - vertical_align: int = 0 - horizontal_align: int = 0 - - def __repr__(self): - """ - Return a compact representation including only non-default fields. - - This avoids verbose output when most fields are defaults. 
- """ - # modified to show values only when non-default - repr_fmt = ', '.join(f'{field.name}={getattr(self, field.name)}' - for field in TEXT_FIELD_MAPPING.values() - if getattr(self, field.name) != field.default) - return f'{self.__class__.__name__}({repr_fmt})' - - def make_sequence(self) -> str: - """Build and return sub-part of an OSC 66 sequence.""" - parts = [] - # build string for all known parameters of non-default values - for field_key, field in TEXT_FIELD_MAPPING.items(): - if (val := getattr(self, field.name)) != field.default: - parts.append(f'{field_key}={val}') - return ':'.join(parts) - - @classmethod - def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams: - """ - Parse colon-separated ``key=value`` metadata string. - - :param raw: Metadata string, e.g. ``'s=2:w=3'``. - :param control_codes: 'parse' or 'strict'. - :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise - ValueError. - :returns: Parsed parameters with values clamped to valid ranges. - Unknown keys are ignored. Non-integer values use defaults. 
- - Example:: - - >>> TextSizingParams.from_params('s=2:w=3') - TextSizingParams(scale=2, width=3, numerator=0, denominator=0, \ - vertical_align=0, horizontal_align=0) - """ - kwargs: typing.Dict[str, int] = {} - if not raw: - return cls() - for part in raw.split(':'): - if '=' not in part: - if control_codes == 'strict': - raise ValueError(f"Expected '=' in text sizing parameter (key=val), " - f"got {part!r} in OSC 66 sequence, {raw!r}") - continue - key, _eq, val = part.partition('=') - field = TEXT_FIELD_MAPPING.get(key) - if field is None: - if control_codes == 'strict': - raise ValueError(f"Unknown text sizing field '{key}' " - f"in OSC 66 sequence, {raw!r}") - # ignore unknown fields unless 'strict' - continue - try: - value = int(val) - except ValueError as exc: - if control_codes == 'strict': - raise ValueError(f"Illegal text sizing value '{val}' " - f"in OSC 66 sequence, {raw!r}: {exc}") from exc - # ignore value, uses default value without warning unless 'strict' - continue - if control_codes == 'strict' and (value > field.high or value < field.low): - raise ValueError(f"Out of bounds text sizing value '{val}' " - f"in OSC 66 sequence, {raw!r}: " - f"allowed range for '{key}' ({field.name}) " - f"is {field.low} to {field.high}") - kwargs[field.name] = max(field.low, min(field.high, value)) - return cls(**kwargs) - - -class TextSizing(typing.NamedTuple): - """Basic horizontal width measurement for kitty text sizing protocol.""" - - params: TextSizingParams - text: str - terminator: str - - @classmethod - def from_match(cls, match: re.Match[str], control_codes: str = 'parse') -> TextSizing: - r""" - Parse using matching OSC 66 Sequence. - - :param match: match object from :attr:`wcwidth.escape_sequences.TEXT_SIZING_PATTERN`. - :param control_codes: 'parse' or 'strict', same meaning as delegated by - :func:`wcwidth.width`. - :raises ValueError: When ``control_codes='strict'`` for unrecognized, invalid, or out of - bounds text sizing parameters. 
- :returns: TextSizing object from parsed sequence - - Example:: - - from wcwidth.escape_sequences import TEXT_SIZING_PATTERN - >>> TextSizing.from_match(TEXT_SIZING_PATTERN.match('\x1b]66;w=2;XY\x07')) - TextSizing(params=TextSizingParams(scale=1, width=2, numerator=0, denominator=0, \ - vertical_align=0, horizontal_align=0), text='XY', terminator='\x07') - """ - return cls(params=TextSizingParams.from_params(match.group(1), control_codes=control_codes), - text=match.group(2), - terminator=match.group(3)) - - def display_width(self, ambiguous_width: int = 1) -> int: - """ - Calculate the display width of a text sizing sequence. - - :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Display width in terminal cells. When ``width > 0``, returns ``params.scale * - params.width``. When ``width == 0``, returns ``params.scale * measured_inner_width``. - """ - if self.params.width > 0: - return self.params.scale * self.params.width - w = wcswidth(self.text, ambiguous_width=ambiguous_width) - return self.params.scale * max(0, w) - - def make_sequence(self) -> str: - """Build and return complete OSC 66 Terminal Sequence.""" - return f'\x1b]66;{self.params.make_sequence()};{self.text}{self.terminator}' diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 749ea58..bb2daf3 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -91,12 +91,10 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH -from .text_sizing import TextSizing, TextSizingParams from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import (ZERO_WIDTH_PATTERN, - TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -104,6 +102,12 @@ strip_sequences) from .unicode_versions import list_versions +# Type aliases for 
output_tokens used by clip(). +# ('vis', text, width_in_cols, start_col) or ('seq', seq_text) +VisToken = tuple[Literal['vis'], str, int, int] +SeqToken = tuple[Literal['seq'], str] +Token = VisToken | SeqToken + # Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API, # or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly # re-export imports here for no other reason than to satisfy the type checkers (mypy). Yak shavings. @@ -343,9 +347,6 @@ def clip( .. versionchanged:: 0.5.0 Added ``propagate_sgr`` parameter (default True). - .. versionchanged:: 0.6.1 - Parses OSC 66 Sequences. - Example:: >>> clip('hello world', 0, 5) @@ -383,7 +384,7 @@ def clip( # remove previously emitted visible characters while keeping the sequence order. # For visible tokens we store ('vis', text, width_in_columns) # For sequences we store ('seq', seq) - output_tokens: list[tuple[str, ...]] = [] + output_tokens: list[Token] = [] visible_count = 0 # number of visible columns emitted so far col = 0 idx = 0 @@ -425,6 +426,7 @@ def _remove_visible_tail(n: int) -> None: if i < 0: break tok = output_tokens[i] + assert tok[0] == 'vis' # guaranteed by while loop above tok_s = tok[1] tok_w = tok[2] tok_start = tok[3] @@ -499,20 +501,6 @@ def _remove_visible_tail(n: int) -> None: _remove_visible_tail(to_remove) idx = match.end() continue - if (ts_match := TEXT_SIZING_PATTERN.match(seq)): - # OSC 66 (text sizing) has positive width - col, visible_count = _text_sizing_clip( - TextSizing.from_match(ts_match), - col=col, start=start, end=end, - output_tokens=output_tokens, - visible_count=visible_count, - fillchar=fillchar, ambiguous_width=ambiguous_width, - ) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - idx = match.end() - continue - # Other zero-width sequences (OSC hyperlinks, etc.) 
— preserve as-is _append_seq(seq) idx = match.end() @@ -611,87 +599,3 @@ def _remove_visible_tail(n: int) -> None: result += '\x1b[0m' return result - - -def _text_sizing_clip( - ts: TextSizing, - *, - col: int, - start: int, - end: int, - output_tokens: list[tuple], - visible_count: int, - fillchar: str = ' ', - ambiguous_width: int = 1, -) -> tuple[int, int]: - """ - Emit tokens for a text-sizing sequence into ``output_tokens``, clipped to ``[start, end)``. - - Returns ``(new_col, new_visible_count)``. - - This was formerly ``TextSizing.clip()`` in :mod:`wcwidth.text_sizing`. It was moved here to - break a circular dependency loop (:mod:`text_sizing` imported :mod:`_width`, and :mod:`_width` - imported :mod:`text_sizing`). - """ - # pylint: disable=too-many-locals - ts_width = ts.display_width(ambiguous_width) - if col >= start and col + ts_width <= end: - output_tokens.append(('seq', ts.make_sequence())) - return col + ts_width, visible_count - if col >= end or col + ts_width <= start: - return col + ts_width, visible_count - - # Partial overlap: decompose into units (graphemes at `scale` cells each), - # emit whole units as sequences and partial units as fillchars. 
- rel_start = max(0, start - col) - rel_end = min(end, col + ts_width) - col - scale = ts.params.scale - - units: list[tuple[str, int]] = [] - if ts.params.width > 0: - inner_graphemes = list(iter_graphemes(ts.text)) - for j in range(ts.params.width): - g = inner_graphemes[j] if j < len(inner_graphemes) else '' - units.append((g, scale)) - else: - for g in iter_graphemes(ts.text): - units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) - - pos = 0 - pending_texts: list[str] = [] - - def flush(): - if not pending_texts: - return - params = TextSizingParams( - scale, - len(pending_texts) if ts.params.width > 0 else 0, - ts.params.numerator, - ts.params.denominator, - ts.params.vertical_align, - ts.params.horizontal_align) - output_tokens.append( - ('seq', TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence())) - pending_texts.clear() - - for unit_text, unit_w in units: - unit_start = pos - unit_end = pos + unit_w - if unit_end <= rel_start: - pos = unit_end - continue - if unit_start >= rel_end: - break - overlap = min(unit_end, rel_end) - max(unit_start, rel_start) - if overlap == unit_w and unit_w > 0: - pending_texts.append(unit_text) - else: - flush() - if overlap > 0: - abs_start = col + max(unit_start, rel_start) - output_tokens.append(('vis', fillchar * overlap, overlap, abs_start)) - visible_count += overlap - pos = unit_end - - flush() - return col + ts_width, visible_count From 131faeb60efed78829654431a1a6db41bf3dbc88 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 22:04:51 -0400 Subject: [PATCH 26/70] stupid simplification/refactor --- tests/test_clip_cursors.py | 13 +++++++++- tests/test_textwrap.py | 33 ++++++++++++++++++++++++ wcwidth/__init__.py | 2 +- wcwidth/text_sizing.py | 2 +- wcwidth/wcwidth.py | 53 +++++++------------------------------- 5 files changed, 57 insertions(+), 46 deletions(-) diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 4e59343..a357126 100644 --- 
a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -19,6 +19,17 @@ ("hello\x1b[10Cworld", 0, 5, "hello"), # Cursor-left overwrites previous characters ("hello\x1b[2DXY", 0, 5, "helXY"), + # Cursor-left that removes entire visible token (tok_w <= to_remove path) + ("abc\x1b[3DXY", 0, 5, "XY"), + # Cursor-left at column 0 (prev_col not > col, no overwrite) + ("\x1b[2Dhi", 0, 2, "hi"), + # Cursor-left with no visible tokens emitted (to_remove <= 0 path) + ("\x1b[5C\x1b[2Dhi", 5, 7, ""), + # Cursor-left triggers _remove_visible_tail with seq tokens before vis token + # exercises the inner while loop that skips past seq tokens (line 422) + ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, "\x1b]8;;http://x.com\x07cd"), + # Cursor-left into wide char twice, second time on empty token triggers i < 0 break + ("中\x1b[D\x1b[Da", 0, 4, "a"), ('ab\x1b[5Ccd', 0, 4, 'ab '), ('abcde\x1b[2Df', 0, 6, 'abcf'), ('ab\x1b[10Ccd', 0, 4, 'ab '), @@ -46,4 +57,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): the clipped output if the moved-to columns are within the clip window; cursor-left allows subsequent characters to overwrite previous content and the clip should reflect that. """ - assert repr(clip(text, start, end)) == repr(expected) + assert repr(clip(text, start, end)) == repr(expected) \ No newline at end of file diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index ae2eb2d..b264472 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -485,3 +485,36 @@ def test_wrap_replace_whitespace_false_newlines_zero_width(): """Newlines have zero display width, so more text fits per line than stdlib.""" assert wrap('hello\nworld foo\nbar', 10, replace_whitespace=False) == [ 'hello\nworld', 'foo\nbar'] + + +# kitty text sizing protocol (OSC 66) constants for wrap() tests. 
+# Width calculation, BEL/ST/scale/auto-width/CJK/SGR interaction with OSC66 are +# already covered exhaustively in test_text_sizing.py and test_clip_*.py. These +# tests verify only the *line-breaking* behaviour that is unique to wrap(). +TS3 = '\x1b]66;w=3;XYZ\x07' # explicit width=3 + + +@pytest.mark.parametrize('text,w,expected', [ + # Greedy fill: atomic sequence moves to next line when line width exceeded + ('abc' + TS3 + 'def', 4, ['abc' + TS3 + 'd', 'ef']), + ('abc' + TS3 + 'def', 5, ['abc' + TS3 + 'de', 'f']), + ('abc' + TS3 + 'def', 6, ['abc', TS3 + 'def']), + ('abc' + TS3 + 'def', 8, ['abc', TS3 + 'def']), + ('abc' + TS3 + 'def', 10, ['abc' + TS3 + 'def']), + # Sequence stays with preceding word when total stripped width fits + ('aa' + TS3 + 'bb', 5, ['aa', TS3 + 'bb']), + ('pre' + TS3 + 'post', 8, ['pre', TS3 + 'post']), +]) +def test_wrap_ts_line_fill(text, w, expected): + """OSC 66 sequence width is respected and treated as atomic unit when filling lines.""" + assert wrap(text, w) == expected + + +@pytest.mark.parametrize('text,w,expected', [ + # max_lines truncation preserves OSC66 sequence atomically with truncated text + ('abc' + TS3 + 'def', 7, ['abc', TS3 + 'def']), + ('ab' + TS3 + 'cd', 6, ['ab', TS3 + 'cd']), +]) +def test_wrap_ts_max_lines(text, w, expected): + """max_lines truncation works correctly with OSC 66 sequences.""" + assert wrap(text, w, max_lines=2, placeholder='~') == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 2205d77..ad43abf 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -47,7 +47,7 @@ # std imports from importlib import metadata as importlib_metadata except ImportError: # pragma: no cover - fallback for very old Pythons - importlib_metadata = None + importlib_metadata = None # type: ignore[assignment] if importlib_metadata is not None: try: diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 4b495e9..451b747 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py 
@@ -75,7 +75,7 @@ class TextSizingParams(typing.NamedTuple): vertical_align: int = 0 horizontal_align: int = 0 - def __repr__(self): + def __repr__(self) -> str: """ Return a compact representation including only non-default fields. diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 749ea58..a90453f 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -390,8 +390,6 @@ def clip( def _append_visible(s: str, w: int, start_col: int | None = None) -> None: nonlocal visible_count, sgr_at_clip_start - if w <= 0: - return if start_col is None: start_col = col prev = output_tokens[-1] if (output_tokens and output_tokens[-1][0] == 'vis') else None @@ -439,7 +437,9 @@ def _remove_visible_tail(n: int) -> None: # slice the string by grapheme widths kept_text = '' acc = 0 - for g in iter_graphemes(tok_s): + g_iter = iter_graphemes(tok_s) + while acc < keep_cols: + g = next(g_iter) gw = width(g, ambiguous_width=ambiguous_width) if acc + gw > keep_cols: break @@ -479,9 +479,8 @@ def _remove_visible_tail(n: int) -> None: if move_start < end and move_end > start: overlap_start = max(move_start, start) overlap_end = min(move_end, end) - overlap = overlap_end - overlap_start - if overlap > 0: - _append_visible(fillchar * overlap, overlap, overlap_start) + _append_visible(fillchar * (overlap_end - overlap_start), + overlap_end - overlap_start, overlap_start) col += n_left idx = match.end() continue @@ -568,38 +567,7 @@ def _remove_visible_tail(n: int) -> None: parts.append(tok[1]) else: # visible chunk: ('vis', text, width_in_cols, start_col) - _, text, tok_w, tok_start = tok - chunk_len = tok_w - chunk_start = tok_start - chunk_end = chunk_start + chunk_len - if chunk_end <= start: - continue - if chunk_start >= end: - continue - s0 = max(0, start - chunk_start) - s1 = min(chunk_len, end - chunk_start) - # slice `text` for columns [s0, s1) - acc = 0 - slice_text = '' - for g in iter_graphemes(text): - gw = width(g, ambiguous_width=ambiguous_width) - next_acc = acc + gw 
- if next_acc <= s0: - acc = next_acc - continue - if acc >= s1: - break - # include this grapheme (or part of it) - # graphemes are atomic; if they partially overlap, use fillchar instead - if acc < s0 or next_acc > s1: - # partial grapheme -> fill with appropriate number of fillchars - left = max(0, s0 - acc) - right = min(gw, s1 - acc) - slice_text += fillchar * (right - left) - else: - slice_text += g - acc = next_acc - parts.append(slice_text) + parts.append(tok[1]) result = ''.join(parts) @@ -687,11 +655,10 @@ def flush(): pending_texts.append(unit_text) else: flush() - if overlap > 0: - abs_start = col + max(unit_start, rel_start) - output_tokens.append(('vis', fillchar * overlap, overlap, abs_start)) - visible_count += overlap + abs_start = col + max(unit_start, rel_start) + output_tokens.append(('vis', fillchar * overlap, overlap, abs_start)) + visible_count += overlap pos = unit_end flush() - return col + ts_width, visible_count + return col + ts_width, visible_count \ No newline at end of file From 84d46a867678267e1994adc4cb9e11567f07887f Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 22:06:13 -0400 Subject: [PATCH 27/70] use NamedTuple .. --- wcwidth/table_ambiguous.py | 3 +- wcwidth/table_grapheme.py | 3 +- wcwidth/table_mc.py | 3 +- wcwidth/table_vs16.py | 3 +- wcwidth/table_wide.py | 7 ++-- wcwidth/table_zero.py | 3 +- wcwidth/wcwidth.py | 75 ++++++++++++++++++++------------------ 7 files changed, 48 insertions(+), 49 deletions(-) diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index 2c40498..e3dc0b1 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,9 +1,8 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-18 23:27:15 UTC. 
""" - # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index d265b66..42fd19e 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,9 +4,8 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-29 23:33:42 UTC. """ - # pylint: disable=duplicate-code GRAPHEME_CR = ( diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 63acce9..7c2e691 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,9 +1,8 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. """ - # pylint: disable=duplicate-code CATEGORY_MC = { '17.0.0': ( diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index a5fc0a8..70e4a73 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,9 +1,8 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. +This code generated by wcwidth/bin/update-tables.py on 2025-09-15 16:57:50 UTC. """ - # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { '9.0.0': ( diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 9d2ebd5..ed6f48a 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,9 +1,8 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. 
""" - # pylint: disable=duplicate-code WIDE_EASTASIAN = { '17.0.0': ( @@ -73,8 +72,8 @@ (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be - (0x17000, 0x18cd5,), # Tangut Ideograph-17000 ..Khitan Small Script Char - (0x18cff, 0x18d1e,), # Khitan Small Script Char..Tangut Ideograph-18d1e + (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char + (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index b669f70..c440bfc 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,9 +1,8 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. """ - # pylint: disable=duplicate-code ZERO_WIDTH = { '17.0.0': ( diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index bb2daf3..400beff 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -63,8 +63,7 @@ # std imports from functools import lru_cache - -from typing import Literal +from typing import Literal, NamedTuple, Union # local # pylint: disable=unused-import @@ -102,11 +101,27 @@ strip_sequences) from .unicode_versions import list_versions -# Type aliases for output_tokens used by clip(). -# ('vis', text, width_in_cols, start_col) or ('seq', seq_text) -VisToken = tuple[Literal['vis'], str, int, int] -SeqToken = tuple[Literal['seq'], str] -Token = VisToken | SeqToken +# Token types for output_tokens used by clip(). 
+# NamedTuple subclasses provide named attribute access while remaining +# plain tuples at runtime — zero overhead over the old bare-tuple approach, +# but with isinstance() type discrimination and meaningful attribute names. + + +class VisToken(NamedTuple): + """A visible text segment with its display width and starting column.""" + + text: str + width: int + start_col: int + + +class SeqToken(NamedTuple): + """A zero-width terminal sequence (escape sequences, control chars, etc.).""" + + text: str + + +Token = Union[VisToken, SeqToken] # Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API, # or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly @@ -395,22 +410,19 @@ def _append_visible(s: str, w: int, start_col: int | None = None) -> None: return if start_col is None: start_col = col - prev = output_tokens[-1] if (output_tokens and output_tokens[-1][0] == 'vis') else None - if prev is not None and prev[3] + prev[2] == start_col: + prev = output_tokens[-1] if (output_tokens and isinstance(output_tokens[-1], VisToken)) else None + if prev is not None and prev.start_col + prev.width == start_col: # merge with previous contiguous visible token: append text and add widths - prev_s = prev[1] - prev_w = prev[2] - prev_start = prev[3] - output_tokens[-1] = ('vis', prev_s + s, prev_w + w, prev_start) + output_tokens[-1] = VisToken(prev.text + s, prev.width + w, prev.start_col) else: - output_tokens.append(('vis', s, w, start_col)) + output_tokens.append(VisToken(s, w, start_col)) visible_count += w if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr def _append_seq(seq: str) -> None: nonlocal sgr_at_clip_start - output_tokens.append(('seq', seq)) + output_tokens.append(SeqToken(seq)) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr @@ -421,33 +433,29 @@ def _remove_visible_tail(n: int) -> None: while to_remove > 0 and visible_count > 0: # find last 
visible token i = len(output_tokens) - 1 - while i >= 0 and output_tokens[i][0] != 'vis': + while i >= 0 and not isinstance(output_tokens[i], VisToken): i -= 1 if i < 0: break tok = output_tokens[i] - assert tok[0] == 'vis' # guaranteed by while loop above - tok_s = tok[1] - tok_w = tok[2] - tok_start = tok[3] - if tok_w <= to_remove: + if tok.width <= to_remove: # remove entire token output_tokens.pop(i) - to_remove -= tok_w - visible_count -= tok_w + to_remove -= tok.width + visible_count -= tok.width else: # shorten token by removing columns from the end - keep_cols = tok_w - to_remove + keep_cols = tok.width - to_remove # slice the string by grapheme widths kept_text = '' acc = 0 - for g in iter_graphemes(tok_s): + for g in iter_graphemes(tok.text): gw = width(g, ambiguous_width=ambiguous_width) if acc + gw > keep_cols: break kept_text += g acc += gw - output_tokens[i] = ('vis', kept_text, acc, tok_start) + output_tokens[i] = VisToken(kept_text, acc, tok.start_col) visible_count -= to_remove to_remove = 0 @@ -552,24 +560,21 @@ def _remove_visible_tail(n: int) -> None: # Reconstruct result from output_tokens, slicing visible content to [start,end) parts: list[str] = [] for tok in output_tokens: - if tok[0] == 'seq': - parts.append(tok[1]) + if isinstance(tok, SeqToken): + parts.append(tok.text) else: - # visible chunk: ('vis', text, width_in_cols, start_col) - _, text, tok_w, tok_start = tok - chunk_len = tok_w - chunk_start = tok_start - chunk_end = chunk_start + chunk_len + chunk_start = tok.start_col + chunk_end = chunk_start + tok.width if chunk_end <= start: continue if chunk_start >= end: continue s0 = max(0, start - chunk_start) - s1 = min(chunk_len, end - chunk_start) + s1 = min(tok.width, end - chunk_start) # slice `text` for columns [s0, s1) acc = 0 slice_text = '' - for g in iter_graphemes(text): + for g in iter_graphemes(tok.text): gw = width(g, ambiguous_width=ambiguous_width) next_acc = acc + gw if next_acc <= s0: From 
fab8771855272456c6ee268a68401f145460b0c0 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Wed, 29 Apr 2026 22:33:03 -0400 Subject: [PATCH 28/70] checkpoint --- docs/api.rst | 4 + tests/test_clip_cursors.py | 13 ++- tests/test_textwrap.py | 33 ++++++++ wcwidth/__init__.py | 3 +- wcwidth/_width.py | 9 ++- wcwidth/escape_sequences.py | 17 +++- wcwidth/table_ambiguous.py | 3 +- wcwidth/table_grapheme.py | 3 +- wcwidth/table_mc.py | 3 +- wcwidth/table_vs16.py | 3 +- wcwidth/table_wide.py | 7 +- wcwidth/table_zero.py | 3 +- wcwidth/wcwidth.py | 157 ++++++++++++++++++++++++++---------- 13 files changed, 204 insertions(+), 54 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 55d288b..a80eb40 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,4 +36,8 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions +.. autofunction:: wcwidth.TextSizing + +.. autofunction:: wcwidth.TextSizingParams + .. _SEMVER: https://semver.org diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 4e59343..a357126 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -19,6 +19,17 @@ ("hello\x1b[10Cworld", 0, 5, "hello"), # Cursor-left overwrites previous characters ("hello\x1b[2DXY", 0, 5, "helXY"), + # Cursor-left that removes entire visible token (tok_w <= to_remove path) + ("abc\x1b[3DXY", 0, 5, "XY"), + # Cursor-left at column 0 (prev_col not > col, no overwrite) + ("\x1b[2Dhi", 0, 2, "hi"), + # Cursor-left with no visible tokens emitted (to_remove <= 0 path) + ("\x1b[5C\x1b[2Dhi", 5, 7, ""), + # Cursor-left triggers _remove_visible_tail with seq tokens before vis token + # exercises the inner while loop that skips past seq tokens (line 422) + ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, "\x1b]8;;http://x.com\x07cd"), + # Cursor-left into wide char twice, second time on empty token triggers i < 0 break + ("中\x1b[D\x1b[Da", 0, 4, "a"), ('ab\x1b[5Ccd', 0, 4, 'ab '), ('abcde\x1b[2Df', 0, 6, 
'abcf'), ('ab\x1b[10Ccd', 0, 4, 'ab '), @@ -46,4 +57,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): the clipped output if the moved-to columns are within the clip window; cursor-left allows subsequent characters to overwrite previous content and the clip should reflect that. """ - assert repr(clip(text, start, end)) == repr(expected) + assert repr(clip(text, start, end)) == repr(expected) \ No newline at end of file diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index ae2eb2d..b264472 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -485,3 +485,36 @@ def test_wrap_replace_whitespace_false_newlines_zero_width(): """Newlines have zero display width, so more text fits per line than stdlib.""" assert wrap('hello\nworld foo\nbar', 10, replace_whitespace=False) == [ 'hello\nworld', 'foo\nbar'] + + +# kitty text sizing protocol (OSC 66) constants for wrap() tests. +# Width calculation, BEL/ST/scale/auto-width/CJK/SGR interaction with OSC66 are +# already covered exhaustively in test_text_sizing.py and test_clip_*.py. These +# tests verify only the *line-breaking* behaviour that is unique to wrap(). 
+TS3 = '\x1b]66;w=3;XYZ\x07' # explicit width=3 + + +@pytest.mark.parametrize('text,w,expected', [ + # Greedy fill: atomic sequence moves to next line when line width exceeded + ('abc' + TS3 + 'def', 4, ['abc' + TS3 + 'd', 'ef']), + ('abc' + TS3 + 'def', 5, ['abc' + TS3 + 'de', 'f']), + ('abc' + TS3 + 'def', 6, ['abc', TS3 + 'def']), + ('abc' + TS3 + 'def', 8, ['abc', TS3 + 'def']), + ('abc' + TS3 + 'def', 10, ['abc' + TS3 + 'def']), + # Sequence stays with preceding word when total stripped width fits + ('aa' + TS3 + 'bb', 5, ['aa', TS3 + 'bb']), + ('pre' + TS3 + 'post', 8, ['pre', TS3 + 'post']), +]) +def test_wrap_ts_line_fill(text, w, expected): + """OSC 66 sequence width is respected and treated as atomic unit when filling lines.""" + assert wrap(text, w) == expected + + +@pytest.mark.parametrize('text,w,expected', [ + # max_lines truncation preserves OSC66 sequence atomically with truncated text + ('abc' + TS3 + 'def', 7, ['abc', TS3 + 'def']), + ('ab' + TS3 + 'cd', 6, ['ab', TS3 + 'cd']), +]) +def test_wrap_ts_max_lines(text, w, expected): + """max_lines truncation works correctly with OSC 66 sequences.""" + assert wrap(text, w, max_lines=2, placeholder='~') == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index fe9f002..ad43abf 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -31,13 +31,14 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse from .textwrap import SequenceTextWrapper, wrap from .sgr_state import propagate_sgr +from .text_sizing import TextSizing, TextSizingParams # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr') + 'list_versions', 'propagate_sgr', 'TextSizing', 'TextSizingParams') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 90105c1..9dc68ed 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -12,9 +12,11 @@ _FITZPATRICK_RANGE, _REGIONAL_INDICATOR_SET) from .table_vs16 import VS16_NARROW_TO_WIDE +from .text_sizing import TextSizing from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .escape_sequences import (ZERO_WIDTH_PATTERN, + TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -124,7 +126,8 @@ def width( # Check for escape sequences that can't be ignored, if present if '\x1b' not in text or ( not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) + not CURSOR_LEFT_SEQUENCE.search(text) and + not TEXT_SIZING_PATTERN.search(text) ): control_codes = 'ignore' @@ -166,6 +169,10 @@ def width( elif (left := CURSOR_LEFT_SEQUENCE.match(seq)): current_col = max(0, current_col - int(left.group(1) or 1)) + # Or OSC 66 (kitty text sizing) + elif (ts_match := TEXT_SIZING_PATTERN.match(seq)): + text_size = TextSizing.from_match(ts_match, control_codes=control_codes) + current_col += text_size.display_width(ambiguous_width) idx = match.end() else: # Errant ESC or unknown sequence: only the first character is zero-width diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 67f6a63..e9d5734 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py 
@@ -11,12 +11,20 @@ import typing +# Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. +# Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). +# https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +TEXT_SIZING_PATTERN = re.compile( + r'\x1b\]66;([^;\x07\x1b]*);([^\x07\x1b]*)(\x07|\x1b\\)' +) + # Zero-width escape sequences (SGR, OSC, CSI, etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( # CSI sequences r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]|' - # OSC sequences + # OSC sequences, note that text sizing protocol (OSC 66) is special case in width() and clip(), + # and contrary to the variable name, it is positive width. r'\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)|' # APC sequences r'\x1b_[^\x1b\x07]*(?:\x07|\x1b\\)|' @@ -128,6 +136,9 @@ def strip_sequences(text: str) -> str: r""" Return text with all terminal escape sequences removed. + For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), + the inner text is preserved. + Unknown or incomplete ESC sequences are preserved. :param text: String that may contain terminal escape sequences. @@ -143,7 +154,11 @@ def strip_sequences(text: str) -> str: 'hello' >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') 'bold red text' + >>> strip_sequences('\x1b]66;s=2;hello\x07') + 'hello' >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') '[view]' """ + if '\x1b]66;' in text: + text = TEXT_SIZING_PATTERN.sub(r'\2', text) return ZERO_WIDTH_PATTERN.sub('', text) diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index e3dc0b1..2c40498 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,8 +1,9 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-18 23:27:15 UTC. 
+This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. """ + # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index 42fd19e..d265b66 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,8 +4,9 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 23:33:42 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code GRAPHEME_CR = ( diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 7c2e691..63acce9 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,8 +1,9 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code CATEGORY_MC = { '17.0.0': ( diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index 70e4a73..a5fc0a8 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,8 +1,9 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2025-09-15 16:57:50 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. """ + # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { '9.0.0': ( diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index ed6f48a..9d2ebd5 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,8 +1,9 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. 
+This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. """ + # pylint: disable=duplicate-code WIDE_EASTASIAN = { '17.0.0': ( @@ -72,8 +73,8 @@ (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be - (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char - (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) + (0x17000, 0x18cd5,), # Tangut Ideograph-17000 ..Khitan Small Script Char + (0x18cff, 0x18d1e,), # Khitan Small Script Char..Tangut Ideograph-18d1e (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index c440bfc..b669f70 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,8 +1,9 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. 
""" + # pylint: disable=duplicate-code ZERO_WIDTH = { '17.0.0': ( diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 400beff..4d65067 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -90,10 +90,12 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH +from .text_sizing import TextSizing, TextSizingParams from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import (ZERO_WIDTH_PATTERN, + TEXT_SIZING_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -362,6 +364,9 @@ def clip( .. versionchanged:: 0.5.0 Added ``propagate_sgr`` parameter (default True). + .. versionchanged:: 0.6.1 + Parses OSC 66 Sequences. + Example:: >>> clip('hello world', 0, 5) @@ -394,11 +399,9 @@ def clip( if propagate_sgr: sgr = _SGR_STATE_DEFAULT - # output_tokens stores tuples ('vis', text) for visible content and ('seq', seq) - # for preserved zero-width sequences. This allows cursor-left overwrites to - # remove previously emitted visible characters while keeping the sequence order. - # For visible tokens we store ('vis', text, width_in_columns) - # For sequences we store ('seq', seq) + # output_tokens stores VisToken for visible content and SeqToken for preserved + # zero-width sequences. This allows cursor-left overwrites to remove previously + # emitted visible characters while keeping the sequence order. 
output_tokens: list[Token] = [] visible_count = 0 # number of visible columns emitted so far col = 0 @@ -406,8 +409,6 @@ def clip( def _append_visible(s: str, w: int, start_col: int | None = None) -> None: nonlocal visible_count, sgr_at_clip_start - if w <= 0: - return if start_col is None: start_col = col prev = output_tokens[-1] if (output_tokens and isinstance(output_tokens[-1], VisToken)) else None @@ -449,7 +450,9 @@ def _remove_visible_tail(n: int) -> None: # slice the string by grapheme widths kept_text = '' acc = 0 - for g in iter_graphemes(tok.text): + g_iter = iter_graphemes(tok.text) + while acc < keep_cols: + g = next(g_iter) gw = width(g, ambiguous_width=ambiguous_width) if acc + gw > keep_cols: break @@ -489,9 +492,8 @@ def _remove_visible_tail(n: int) -> None: if move_start < end and move_end > start: overlap_start = max(move_start, start) overlap_end = min(move_end, end) - overlap = overlap_end - overlap_start - if overlap > 0: - _append_visible(fillchar * overlap, overlap, overlap_start) + _append_visible(fillchar * (overlap_end - overlap_start), + overlap_end - overlap_start, overlap_start) col += n_left idx = match.end() continue @@ -509,6 +511,21 @@ def _remove_visible_tail(n: int) -> None: _remove_visible_tail(to_remove) idx = match.end() continue + + if (ts_match := TEXT_SIZING_PATTERN.match(seq)): + # OSC 66 (text sizing) has positive width + col, visible_count = _text_sizing_clip( + TextSizing.from_match(ts_match), + col=col, start=start, end=end, + output_tokens=output_tokens, + visible_count=visible_count, + fillchar=fillchar, ambiguous_width=ambiguous_width, + ) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + idx = match.end() + continue + # Other zero-width sequences (OSC hyperlinks, etc.) 
— preserve as-is _append_seq(seq) idx = match.end() @@ -557,42 +574,15 @@ def _remove_visible_tail(n: int) -> None: idx += len(grapheme) - # Reconstruct result from output_tokens, slicing visible content to [start,end) + # Reconstruct result from output_tokens. The emission phase guarantees that + # all visible tokens are fully within the clip window, so no sub-token slicing + # or boundary checks are needed here. parts: list[str] = [] for tok in output_tokens: if isinstance(tok, SeqToken): parts.append(tok.text) else: - chunk_start = tok.start_col - chunk_end = chunk_start + tok.width - if chunk_end <= start: - continue - if chunk_start >= end: - continue - s0 = max(0, start - chunk_start) - s1 = min(tok.width, end - chunk_start) - # slice `text` for columns [s0, s1) - acc = 0 - slice_text = '' - for g in iter_graphemes(tok.text): - gw = width(g, ambiguous_width=ambiguous_width) - next_acc = acc + gw - if next_acc <= s0: - acc = next_acc - continue - if acc >= s1: - break - # include this grapheme (or part of it) - # graphemes are atomic; if they partially overlap, use fillchar instead - if acc < s0 or next_acc > s1: - # partial grapheme -> fill with appropriate number of fillchars - left = max(0, s0 - acc) - right = min(gw, s1 - acc) - slice_text += fillchar * (right - left) - else: - slice_text += g - acc = next_acc - parts.append(slice_text) + parts.append(tok.text) result = ''.join(parts) @@ -604,3 +594,86 @@ def _remove_visible_tail(n: int) -> None: result += '\x1b[0m' return result + + +def _text_sizing_clip( + ts: TextSizing, + *, + col: int, + start: int, + end: int, + output_tokens: list[Token], + visible_count: int, + fillchar: str = ' ', + ambiguous_width: int = 1, +) -> tuple[int, int]: + """ + Emit tokens for a text-sizing sequence into ``output_tokens``, clipped to ``[start, end)``. + + Returns ``(new_col, new_visible_count)``. + + This was formerly ``TextSizing.clip()`` in :mod:`wcwidth.text_sizing`. 
It was moved here to + break a circular dependency loop (:mod:`text_sizing` imported :mod:`_width`, and :mod:`_width` + imported :mod:`text_sizing`). + """ + # pylint: disable=too-many-locals + ts_width = ts.display_width(ambiguous_width) + if col >= start and col + ts_width <= end: + output_tokens.append(SeqToken(ts.make_sequence())) + return col + ts_width, visible_count + if col >= end or col + ts_width <= start: + return col + ts_width, visible_count + + # Partial overlap: decompose into units (graphemes at `scale` cells each), + # emit whole units as sequences and partial units as fillchars. + rel_start = max(0, start - col) + rel_end = min(end, col + ts_width) - col + scale = ts.params.scale + + units: list[tuple[str, int]] = [] + if ts.params.width > 0: + inner_graphemes = list(iter_graphemes(ts.text)) + for j in range(ts.params.width): + g = inner_graphemes[j] if j < len(inner_graphemes) else '' + units.append((g, scale)) + else: + for g in iter_graphemes(ts.text): + units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) + + pos = 0 + pending_texts: list[str] = [] + + def flush(): + if not pending_texts: + return + params = TextSizingParams( + scale, + len(pending_texts) if ts.params.width > 0 else 0, + ts.params.numerator, + ts.params.denominator, + ts.params.vertical_align, + ts.params.horizontal_align) + output_tokens.append( + SeqToken(TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence())) + pending_texts.clear() + + for unit_text, unit_w in units: + unit_start = pos + unit_end = pos + unit_w + if unit_end <= rel_start: + pos = unit_end + continue + if unit_start >= rel_end: + break + overlap = min(unit_end, rel_end) - max(unit_start, rel_start) + if overlap == unit_w and unit_w > 0: + pending_texts.append(unit_text) + else: + flush() + abs_start = col + max(unit_start, rel_start) + output_tokens.append(VisToken(fillchar * overlap, overlap, abs_start)) + visible_count += overlap + pos = unit_end + + flush() + return 
col + ts_width, visible_count \ No newline at end of file From a01f045896a10ad183ac00ff975660eed6fbc139 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 00:37:08 -0400 Subject: [PATCH 29/70] allow clip() to better translate CURSOR_LEFT, +text_size ``hello\x1b[5Dw`` *WAS* 'w', but it should be 'wello' and is done in this commit using "Painter's Algorithm" like what is done in graphics engines, that we draw "back to front" and look at what is remaining, here by dictionary key as horizontal cursor position --- tests/test_clip_cursors.py | 16 +-- wcwidth/wcwidth.py | 217 +++++++++++++++++++++---------------- 2 files changed, 129 insertions(+), 104 deletions(-) diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index a357126..f19e8e9 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -19,19 +19,19 @@ ("hello\x1b[10Cworld", 0, 5, "hello"), # Cursor-left overwrites previous characters ("hello\x1b[2DXY", 0, 5, "helXY"), - # Cursor-left that removes entire visible token (tok_w <= to_remove path) - ("abc\x1b[3DXY", 0, 5, "XY"), + # Cursor-left overwrites entire visible token + ("abc\x1b[3DXY", 0, 5, "XYc"), # Cursor-left at column 0 (prev_col not > col, no overwrite) ("\x1b[2Dhi", 0, 2, "hi"), - # Cursor-left with no visible tokens emitted (to_remove <= 0 path) + # Cursor-left with no visible tokens emitted ("\x1b[5C\x1b[2Dhi", 5, 7, ""), - # Cursor-left triggers _remove_visible_tail with seq tokens before vis token - # exercises the inner while loop that skips past seq tokens (line 422) - ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, "\x1b]8;;http://x.com\x07cd"), + # Cursor-left overwrites text, seq tokens preserve column spatial order + ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, "cd\x1b]8;;http://x.com\x07"), # Cursor-left into wide char twice, second time on empty token triggers i < 0 break - ("中\x1b[D\x1b[Da", 0, 4, "a"), + ("中\x1b[D\x1b[Da", 0, 4, "a "), ('ab\x1b[5Ccd', 0, 4, 'ab '), - ('abcde\x1b[2Df', 0, 
6, 'abcf'), + ('abcde\x1b[2Df', 0, 6, 'abcfe'), + ('hello\x1b[5Dw', 0, 5, 'wello'), ('ab\x1b[10Ccd', 0, 4, 'ab '), ('XY\x1b[Czy', 0, 4, 'XY z'), ('XY\x1b[Czy', 0, 5, 'XY zy'), diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 4d65067..b969fb8 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -103,6 +103,7 @@ strip_sequences) from .unicode_versions import list_versions + # Token types for output_tokens used by clip(). # NamedTuple subclasses provide named attribute access while remaining # plain tuples at runtime — zero overhead over the old bare-tuple approach, @@ -399,69 +400,50 @@ def clip( if propagate_sgr: sgr = _SGR_STATE_DEFAULT - # output_tokens stores VisToken for visible content and SeqToken for preserved - # zero-width sequences. This allows cursor-left overwrites to remove previously - # emitted visible characters while keeping the sequence order. - output_tokens: list[Token] = [] - visible_count = 0 # number of visible columns emitted so far + # Painter's algorithm data structures: + # 1. cells: maps column integer to a visible character (with its width) + # cells that are part of a wide character's right half are not populated. + # 2. sequences: maps column integer to a list of zero-width sequences emitted at that position + # and their chronological order number. 
+ cells: dict[int, tuple[str, int]] = {} + sequences: list[tuple[int, int, str]] = [] # (col, seq_order, text) + seq_order = 0 # relative ordering of sequences + col = 0 idx = 0 - def _append_visible(s: str, w: int, start_col: int | None = None) -> None: - nonlocal visible_count, sgr_at_clip_start - if start_col is None: - start_col = col - prev = output_tokens[-1] if (output_tokens and isinstance(output_tokens[-1], VisToken)) else None - if prev is not None and prev.start_col + prev.width == start_col: - # merge with previous contiguous visible token: append text and add widths - output_tokens[-1] = VisToken(prev.text + s, prev.width + w, prev.start_col) - else: - output_tokens.append(VisToken(s, w, start_col)) - visible_count += w + def _write_cells(s: str, w: int, write_col: int) -> None: + nonlocal sgr_at_clip_start + if w > 0: + # 1. Overwriting the right half of a wide char leaves left half as fillchar + if write_col > 0 and write_col - 1 in cells and cells[write_col - 1][1] == 2: + cells[write_col - 1] = (fillchar, 1) + + # 2. Overwriting the left half of a wide char leaves right half as fillchar + if w == 1 and write_col in cells and cells[write_col][1] == 2: + cells[write_col + 1] = (fillchar, 1) + + if w == 2 and write_col + 1 in cells and cells[write_col + 1][1] == 2: + cells[write_col + 2] = (fillchar, 1) + + # 3. 
Clean up the cells we are fully overwriting + for i in range(w): + if write_col + i in cells: + del cells[write_col + i] + + cells[write_col] = (s, w) + if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr - def _append_seq(seq: str) -> None: - nonlocal sgr_at_clip_start - output_tokens.append(SeqToken(seq)) + def _append_seq(seq: str, at_col: int | None = None) -> None: + nonlocal sgr_at_clip_start, seq_order + c = col if at_col is None else at_col + sequences.append((c, seq_order, seq)) + seq_order += 1 if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr - def _remove_visible_tail(n: int) -> None: - """Remove n visible columns from the end of output_tokens (overwrite semantics).""" - nonlocal visible_count - to_remove = n - while to_remove > 0 and visible_count > 0: - # find last visible token - i = len(output_tokens) - 1 - while i >= 0 and not isinstance(output_tokens[i], VisToken): - i -= 1 - if i < 0: - break - tok = output_tokens[i] - if tok.width <= to_remove: - # remove entire token - output_tokens.pop(i) - to_remove -= tok.width - visible_count -= tok.width - else: - # shorten token by removing columns from the end - keep_cols = tok.width - to_remove - # slice the string by grapheme widths - kept_text = '' - acc = 0 - g_iter = iter_graphemes(tok.text) - while acc < keep_cols: - g = next(g_iter) - gw = width(g, ambiguous_width=ambiguous_width) - if acc + gw > keep_cols: - break - kept_text += g - acc += gw - output_tokens[i] = VisToken(kept_text, acc, tok.start_col) - visible_count -= to_remove - to_remove = 0 - while idx < len(text): char = text[idx] @@ -492,8 +474,8 @@ def _remove_visible_tail(n: int) -> None: if move_start < end and move_end > start: overlap_start = max(move_start, start) overlap_end = min(move_end, end) - _append_visible(fillchar * (overlap_end - overlap_start), - overlap_end - overlap_start, overlap_start) + for i in range(overlap_start, overlap_end): + _write_cells(fillchar, 1, i) col += n_left 
idx = match.end() continue @@ -501,24 +483,16 @@ def _remove_visible_tail(n: int) -> None: if (match_cright := CURSOR_LEFT_SEQUENCE.match(seq)): digit_txt = match_cright.group(1) n_right = int(digit_txt) if digit_txt else 1 - prev_col = col col = max(0, col - n_right) - # If we moved left and had emitted visible columns beyond - # the new col, they are now potentially overwritten. - if prev_col > col: - to_remove = min(prev_col - col, visible_count) - if to_remove > 0: - _remove_visible_tail(to_remove) idx = match.end() continue if (ts_match := TEXT_SIZING_PATTERN.match(seq)): # OSC 66 (text sizing) has positive width - col, visible_count = _text_sizing_clip( + col = _text_sizing_clip( TextSizing.from_match(ts_match), col=col, start=start, end=end, - output_tokens=output_tokens, - visible_count=visible_count, + write_cells=_write_cells, fillchar=fillchar, ambiguous_width=ambiguous_width, ) if propagate_sgr and sgr_at_clip_start is None: @@ -543,7 +517,7 @@ def _remove_visible_tail(n: int) -> None: next_tab = col + (tabsize - (col % tabsize)) while col < next_tab: if start <= col < end: - _append_visible(' ', 1) + _write_cells(' ', 1, col) col += 1 else: # preserve tab as-is @@ -561,28 +535,78 @@ def _remove_visible_tail(n: int) -> None: _append_seq(grapheme) elif col >= start and col + grapheme_w <= end: # Fully visible - _append_visible(grapheme, grapheme_w) + _write_cells(grapheme, grapheme_w, col) col += grapheme_w elif col < end and col + grapheme_w > start: # Partially visible (wide char at boundary) -> emit fillchars for visible portion overlap = min(end, col + grapheme_w) - max(start, col) abs_start = max(start, col) - _append_visible(fillchar * overlap, overlap, abs_start) + for i in range(overlap): + _write_cells(fillchar, 1, abs_start + i) col += grapheme_w else: col += grapheme_w idx += len(grapheme) - # Reconstruct result from output_tokens. 
The emission phase guarantees that - # all visible tokens are fully within the clip window, so no sub-token slicing - # or boundary checks are needed here. + # Reconstruct result from painter's algorithm grid. parts: list[str] = [] - for tok in output_tokens: - if isinstance(tok, SeqToken): - parts.append(tok.text) + + seqs_by_col: dict[int, list[tuple[int, str]]] = {} + for seq_col, seq_ord, seq_text in sequences: + if seq_col not in seqs_by_col: + seqs_by_col[seq_col] = [] + seqs_by_col[seq_col].append((seq_ord, seq_text)) + + for c in seqs_by_col: + seqs_by_col[c].sort() + + max_col = max((max(cells.keys()) + 1 if cells else 0), + (max(seqs_by_col.keys()) if seqs_by_col else 0)) + + walk_col = 0 + # walk_col reaches exactly up to end, to ensure sequences at `end` are processed + while walk_col <= max_col and walk_col <= end: + if walk_col in seqs_by_col: + for _, seq_text in seqs_by_col[walk_col]: + parts.append(seq_text) + + if walk_col >= end: + walk_col += 1 + continue + + if walk_col in cells: + cell_text, cell_w = cells[walk_col] + + # Calculate overlap with [start, end) + cell_start = walk_col + cell_end = walk_col + cell_w + + overlap_start = max(start, cell_start) + overlap_end = min(end, cell_end) + + if overlap_start < overlap_end: + if cell_start >= start and cell_end <= end: + # Fully inside + parts.append(cell_text) + else: + # Partially inside (split wide char) + parts.append(fillchar * (overlap_end - overlap_start)) + + walk_col += cell_w else: - parts.append(tok.text) + # It's a hole. Only emit fillchar if we are inside the clip window + # AND if we are within the bounds of where visible text was written. 
+ if walk_col >= start and cells and walk_col < max(cells.keys()) + 1: + parts.append(fillchar) + walk_col += 1 + + # Append any remaining sequences that occurred past the clip end boundary + # This preserves SGR resets and trailing hyperlinks if the loop broke early + for c in sorted(seqs_by_col.keys()): + if c > end or (c == end and c > max_col): + for _, seq_text in seqs_by_col[c]: + parts.append(seq_text) result = ''.join(parts) @@ -602,27 +626,22 @@ def _text_sizing_clip( col: int, start: int, end: int, - output_tokens: list[Token], - visible_count: int, + write_cells, fillchar: str = ' ', ambiguous_width: int = 1, -) -> tuple[int, int]: +) -> int: """ - Emit tokens for a text-sizing sequence into ``output_tokens``, clipped to ``[start, end)``. - - Returns ``(new_col, new_visible_count)``. + Emit tokens for a text-sizing sequence, clipped to ``[start, end)``. - This was formerly ``TextSizing.clip()`` in :mod:`wcwidth.text_sizing`. It was moved here to - break a circular dependency loop (:mod:`text_sizing` imported :mod:`_width`, and :mod:`_width` - imported :mod:`text_sizing`). + Returns ``new_col``. """ # pylint: disable=too-many-locals ts_width = ts.display_width(ambiguous_width) if col >= start and col + ts_width <= end: - output_tokens.append(SeqToken(ts.make_sequence())) - return col + ts_width, visible_count + write_cells(ts.make_sequence(), ts_width, col) + return col + ts_width if col >= end or col + ts_width <= start: - return col + ts_width, visible_count + return col + ts_width # Partial overlap: decompose into units (graphemes at `scale` cells each), # emit whole units as sequences and partial units as fillchars. 
@@ -643,7 +662,7 @@ def _text_sizing_clip( pos = 0 pending_texts: list[str] = [] - def flush(): + def flush(flush_col: int, flush_width: int): if not pending_texts: return params = TextSizingParams( @@ -653,10 +672,11 @@ def flush(): ts.params.denominator, ts.params.vertical_align, ts.params.horizontal_align) - output_tokens.append( - SeqToken(TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence())) + write_cells(TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence(), flush_width, flush_col) pending_texts.clear() + flush_col_pos = col + rel_start + flush_width = 0 for unit_text, unit_w in units: unit_start = pos unit_end = pos + unit_w @@ -667,13 +687,18 @@ def flush(): break overlap = min(unit_end, rel_end) - max(unit_start, rel_start) if overlap == unit_w and unit_w > 0: + if not pending_texts: + flush_col_pos = col + max(unit_start, rel_start) + flush_width = 0 pending_texts.append(unit_text) + flush_width += overlap else: - flush() + flush(flush_col_pos, flush_width) + flush_width = 0 abs_start = col + max(unit_start, rel_start) - output_tokens.append(VisToken(fillchar * overlap, overlap, abs_start)) - visible_count += overlap + for i in range(overlap): + write_cells(fillchar, 1, abs_start + i) pos = unit_end - flush() - return col + ts_width, visible_count \ No newline at end of file + flush(flush_col_pos, flush_width) + return col + ts_width From 29e954858722568270003c41ed945a9468434676 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 00:39:18 -0400 Subject: [PATCH 30/70] narf, go ahead commit text sizing here, too --- tests/test_text_sizing.py | 279 ++++++++++++++++++++++++++++++++++++++ wcwidth/text_sizing.py | 196 ++++++++++++++++++++++++++ 2 files changed, 475 insertions(+) create mode 100644 tests/test_text_sizing.py create mode 100644 wcwidth/text_sizing.py diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py new file mode 100644 index 0000000..ff634ea --- /dev/null +++ 
b/tests/test_text_sizing.py @@ -0,0 +1,279 @@ +"""Tests for Text Sizing Protocol (OSC 66) support.""" + +# 3rd party +import pytest + +# local +from wcwidth import (TextSizing, + TextSizingParams, + clip, + width, + wcswidth, + iter_sequences, + strip_sequences) +from wcwidth.text_sizing import TEXT_FIELD_MAPPING +from wcwidth.escape_sequences import TEXT_SIZING_PATTERN + +_W_HI = TEXT_FIELD_MAPPING['w'].high +_N_HI = TEXT_FIELD_MAPPING['n'].high +_D_HI = TEXT_FIELD_MAPPING['d'].high + +CONTROL_CODES_PARAMS_CASES = [ + ('x=2', "", "Unknown text sizing field 'x' in "), + ('s=3:x=3', "s=3", "Unknown text sizing field 'x' in "), + ('s=2:x=3:w=9', f"s=2:w={_W_HI}", "Unknown text sizing field 'x' in "), + ('xyz=2', "", "Unknown text sizing field 'xyz' in "), + ('xxx', "", "Expected '=' in text sizing parameter"), + ('s=xxx', "", "Illegal text sizing value 'xxx' in "), + ('s=-99', "", "Out of bounds text sizing value '-99' in "), + ('s=99', f"s={_W_HI}", "Out of bounds text sizing value '99' in "), + ('w=-1', "", "Out of bounds text sizing value '-1' in "), + ('w=8', f"w={_W_HI}", "Out of bounds text sizing value '8' in "), + ('n=20', f"n={_N_HI}", "Out of bounds text sizing value '20' in "), + ('d=99', f"d={_D_HI}", "Out of bounds text sizing value '99' in "), + ('v=5', "v=2", "Out of bounds text sizing value '5' in "), + ('h=3', "h=2", "Out of bounds text sizing value '3' in "), +] + + +@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) +def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): + """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" + # assert control_codes='strict' raises expected exception, + with pytest.raises(ValueError) as exc_info: + TextSizingParams.from_params(given_params, control_codes='strict') + assert exc_info.value.args[0].startswith(expected_exc) + + # when 'parse' (default), any illegal argument or value is filtered, 
excluded, or clipped + params = TextSizingParams.from_params(given_params) + assert params.make_sequence() == expected_remainder + + +@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) +def test_text_sizing_width_control_codes(given_params, expected_remainder, expected_exc): + """Verify control_codes='strict' with invalid OSC 66 sequences in wciwdth.width().""" + seq1 = '\x1b]66;' + given_params + ';ABC' + '\x07' + seq2 = '\x1b]66;' + given_params + ';ABC' + '\x1b\\' + for seq in (seq1, seq2): + with pytest.raises(ValueError) as exc_info: + width(seq, control_codes='strict') + assert exc_info.value.args[0].startswith(expected_exc) + + +@pytest.mark.parametrize('params,text,expected_width', [ + # cases of static width=N values, + (TextSizingParams(scale=2, width=1), 'climclam', 2), + (TextSizingParams(scale=2, width=3), 'anything', 6), + (TextSizingParams(scale=1, width=5), '', 5), + (TextSizingParams(scale=3, width=1), 'x', 3), + # and automatic width (width=0) values, + (TextSizingParams(), '', 0), + (TextSizingParams(), 'AB', 2), + (TextSizingParams(), '中', 2), + (TextSizingParams(scale=2), 'AB', 4), + (TextSizingParams(scale=2), '中', 4), + (TextSizingParams(scale=3), '', 0), + (TextSizingParams(scale=7, width=7, numerator=15, denominator=15, + vertical_align=2, horizontal_align=2), 'x!yzzy', 49), +]) +def test_text_sizing_width(params, text, expected_width): + """Verify width using with both kinds of terminator.""" + # verify internal TextSizing.display_width() result, + assert TextSizing(params, text, terminator='\x07').display_width() == expected_width + assert TextSizing(params, text, terminator='\x1b\\').display_width() == expected_width + seq1 = TextSizing(params, text, terminator='\x07').make_sequence() + seq2 = TextSizing(params, text, terminator='\x1b\\').make_sequence() + + # verify round-trip + ts_match1, ts_match2 = TEXT_SIZING_PATTERN.match(seq1), TEXT_SIZING_PATTERN.match(seq2) + assert ts_match1 
and ts_match2 + assert TextSizing.from_match(ts_match1) == TextSizing(params, text, terminator='\x07') + assert TextSizing.from_match(ts_match2) == TextSizing(params, text, terminator='\x1b\\') + + # and external width(), + assert width(seq1) == expected_width + assert width(seq2) == expected_width + + # verify 'strict' does not raise ValueError + width(seq1, control_codes='strict') + width(seq2, control_codes='strict') + + # and verify 'ignore' measures only inner_text (does not parse scale or width) + assert width(seq1, control_codes='ignore') == wcswidth(text) + assert width(seq2, control_codes='ignore') == wcswidth(text) + + +@pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ + ('\x1b]66;s=2:w=2;AB\x07', 'AB', 's=2:w=2', 4), + ('\x1b]66;s=2:w=2;\u4e2d\x07', '\u4e2d', 's=2:w=2', 4), + ('\x1b]66;s=3:w=1;x\x07', 'x', 's=3:w=1', 3), + ('\x1b]66;w=5;hello\x07', 'hello', 'w=5', 5), + ('\x1b]66;s=2:w=3;anything\x07', 'anything', 's=2:w=3', 6), + ('\x1b]66;w=3;x\x07', 'x', 'w=3', 3), + ('\x1b]66;s=1;AB\x07', 'AB', '', 2), + ('\x1b]66;s=2;AB\x07', 'AB', 's=2', 4), + ('\x1b]66;s=2;中\x07', '中', 's=2', 4), + ('\x1b]66;s=2;\x07', '', 's=2', 0), + ('\x1b]66;s=1:w=1;\x07', '', 'w=1', 1), + ('\x1b]66;w=2;A\x07', 'A', 'w=2', 2), + ('\x1b]66;s=2:w=3;text\x1b\\', 'text', 's=2:w=3', 6), +]) +def test_text_sizing_sequence(given_sequence, expected_text, expected_params, expected_width): + """Verify parsing and measured width of raw OSC 66 sequence.""" + ts_match = TEXT_SIZING_PATTERN.match(given_sequence) + assert ts_match is not None + text_size = TextSizing.from_match(ts_match) + assert text_size.params.make_sequence() == expected_params + assert text_size.text == expected_text + assert width(given_sequence, control_codes='parse') == expected_width + assert width(given_sequence, control_codes='strict') == expected_width + assert width(given_sequence, control_codes='ignore') == wcswidth(expected_text) + + 
+@pytest.mark.parametrize('text,expected', [ + ('\x1b]66;s=2:w=3:n=1:d=2:v=1:h=2;x!yzzy\x1b\\', 6), + ('\x1b]66;s=2:w=3;anything\x07', 6), + ('\x1b]66;w=3;x\x07', 3), + ('\x1b]66;s=1:w=0;AB\x07', 2), + ('\x1b]66;s=2:w=0;AB\x07', 4), + ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), # '中' + ('\x1b]66;s=1:w=0;\x07', 0), + ('abc\x1b]66;w=3;x\x07def', 9), + ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), + ('\x1b]66;s=2:w=3;text\x1b\\', 6), + ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), +]) +def test_strings_with_text_sizing(text, expected): + """Verify measured width strings containing OSC66.""" + assert width(text) == expected + assert width(text, control_codes='strict') == expected + + +@pytest.mark.parametrize('text,expected', [ + ('\x1b]66;s=2;hello\x07', 'hello'), + ('\x1b]66;s=2;hello\x1b\\', 'hello'), + ('\x1b]66;;text\x07', 'text'), + ('\x1b]66;s=3:w=2;\x07', ''), + ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), + ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), + ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), +]) +def test_strip_strings_with_text_sizing(text, expected): + assert strip_sequences(text) == expected + + +@pytest.mark.parametrize('text,expected_segs', [ + ('abc\x1b]66;s=2;hello\x07def', [('abc', False), ('\x1b]66;s=2;hello\x07', True), ('def', False)]), + ('abc\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\def', [('abc', False), ('\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\', True), ('def', False)]), +]) +def test_iter_sequences_text_sizing(text, expected_segs): + assert list(iter_sequences(text)) == expected_segs + + +@pytest.mark.parametrize('text,start,end,expected', [ + ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), + ('\x1b]66;w=3;ABC\x07', 0, 2, '\x1b]66;w=2;AB\x07'), + ('\x1b]66;w=3;ABC\x07', 1, 3, '\x1b]66;w=2;BC\x07'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab\x1b]66;w=1;X\x07'), + ('ab\x1b]66;w=2;XY\x07cd', 3, 6, '\x1b]66;w=1;Y\x07cd'), + ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), +]) +def test_clip_text_sizing_basic(text, 
start, end, expected): + """Test basic support of clip() with text sizing sequence.""" + assert repr(clip(text, start, end)) == repr(expected) + + +@pytest.mark.parametrize('text,start,end,expected', [ + ('\x1b]66;s=2;ABC\x07', 0, 0, ''), + ('\x1b]66;s=2;ABC\x07', 6, 6, ''), + ('\x1b]66;s=2;ABC\x07', 0, 2, '\x1b]66;s=2;A\x07'), + ('\x1b]66;s=2;ABC\x07', 0, 4, '\x1b]66;s=2;AB\x07'), + ('\x1b]66;s=2;ABC\x07', 0, 6, '\x1b]66;s=2;ABC\x07'), + ('\x1b]66;s=2;ABC\x07', 2, 6, '\x1b]66;s=2;BC\x07'), + ('\x1b]66;s=2;ABC\x07', 4, 6, '\x1b]66;s=2;C\x07'), +]) +def test_clip_text_sizing_scaled(text, start, end, expected): + """Test support of clip() with scale=N arguments.""" + assert repr(clip(text, start, end)) == repr(expected) + + +@pytest.mark.parametrize('text,start,end,expected', [ + # a b c + # === === === + # 012 345 678 + # . + # .. + # *a* + # *a* . + # ... *b* + # ... *b* . + # ... *b* .. + # ... *b* *c* + ('\x1b]66;s=3;ABC\x07', 0, 0, ''), + ('\x1b]66;s=3;ABC\x07', 0, 1, '.'), + ('\x1b]66;s=3;ABC\x07', 0, 2, '..'), + ('\x1b]66;s=3;ABC\x07', 0, 3, '\x1b]66;s=3;A\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 4, '\x1b]66;s=3;A\x07.'), + ('\x1b]66;s=3;ABC\x07', 0, 5, '\x1b]66;s=3;A\x07..'), + ('\x1b]66;s=3;ABC\x07', 0, 6, '\x1b]66;s=3;AB\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 7, '\x1b]66;s=3;AB\x07.'), + ('\x1b]66;s=3;ABC\x07', 0, 8, '\x1b]66;s=3;AB\x07..'), + ('\x1b]66;s=3;ABC\x07', 0, 9, '\x1b]66;s=3;ABC\x07'), + ('\x1b]66;s=3;ABC\x07', 0, 10, '\x1b]66;s=3;ABC\x07'), + # a b + # === === === + # 012 345 678 + # . 1, 2 + # .. 1, 3 + # .. . 1, 4 + # .. .. 1, 5 + # .. *b* 1, 6 + # .. *b* . 1, 7 + # .. *b* .. 1, 8 + # .. 
*b* *c* 1, 9 + ('\x1b]66;s=3;ABC\x07', 1, 1, ''), + ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), + ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), + ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), + ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), + ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 1, 10, '..\x1b]66;s=3;BC\x07'), + # two-thirds of string 'A' and half of string 'B' is fillchar + # ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), + # half of string 'A' and all of string 'B' + # a b + # === === === + # 012 345 678 + # . 2, 3 + # . . 2, 4 + # . .. 2, 5 + # . *b* 2, 6 + # . *b* . 2, 7 + # . *b* .. 2, 8 + # . *b* *c* 2, 9 + ('\x1b]66;s=3;ABC\x07', 2, 2, ''), + ('\x1b]66;s=3;ABC\x07', 2, 3, '.'), + ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), + ('\x1b]66;s=3;ABC\x07', 2, 5, '...'), + ('\x1b]66;s=3;ABC\x07', 2, 6, '.\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 2, 7, '.\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 2, 8, '.\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 2, 9, '.\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 2, 10, '.\x1b]66;s=3;BC\x07'), + # and now 3:10, should be easy ... 
+ ('\x1b]66;s=3;ABC\x07', 3, 3, ''), + ('\x1b]66;s=3;ABC\x07', 3, 4, '.'), + ('\x1b]66;s=3;ABC\x07', 3, 5, '..'), + ('\x1b]66;s=3;ABC\x07', 3, 6, '\x1b]66;s=3;B\x07'), + ('\x1b]66;s=3;ABC\x07', 3, 7, '\x1b]66;s=3;B\x07.'), + ('\x1b]66;s=3;ABC\x07', 3, 8, '\x1b]66;s=3;B\x07..'), + ('\x1b]66;s=3;ABC\x07', 3, 9, '\x1b]66;s=3;BC\x07'), + ('\x1b]66;s=3;ABC\x07', 3, 10, '\x1b]66;s=3;BC\x07'), +]) +def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): + """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" + assert repr(clip(text, start, end, fillchar='.')) == repr(expected) diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py new file mode 100644 index 0000000..451b747 --- /dev/null +++ b/wcwidth/text_sizing.py @@ -0,0 +1,196 @@ +r""" +`kitty text sizing protocol`_ (OSC 66) parsing and measurement. + +The kitty text sizing protocol allows terminal apps to explicitly tell +terminals how many cells text occupies, using the escape sequence:: + + ESC ] 66 ; metadata ; text BEL/ST + +Metadata is colon-separated ``key=value`` pairs: + +- ``s``: scale +- ``w``: width in cells +- ``n``: fractional numerator +- ``d``: fractional denominator +- ``v``: vertical alignment +- ``h``: horizontal alignment + +Parsing is pretty straight-forward: + +- When ``w > 0``, return ``s * w``. +- Otherwise ``w == 0``, ``s * wcswidth(inner_text_width)`` cells. + +Numerator, denominator, and alignment codes and values are parsed but otherwise ignored +and have no effect on measurements made in this library. + +.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ + +.. 
versionadded:: 0.6.1 +""" + +from __future__ import annotations + +# std imports +import re + +import typing + +# local +from ._wcswidth import wcswidth + + +class _FieldMeta(typing.NamedTuple): + name: str + low: int + high: int + default: int + + +TEXT_FIELD_MAPPING: dict[str, _FieldMeta] = { + 's': _FieldMeta(name='scale', low=1, high=7, default=1), + 'w': _FieldMeta(name='width', low=0, high=7, default=0), + 'n': _FieldMeta(name='numerator', low=0, high=15, default=0), + 'd': _FieldMeta(name='denominator', low=0, high=15, default=0), + 'v': _FieldMeta(name='vertical_align', low=0, high=2, default=0), + 'h': _FieldMeta(name='horizontal_align', low=0, high=2, default=0)} + + +class TextSizingParams(typing.NamedTuple): + """ + Parsed parameters from a text sizing escape sequence (OSC 66). + + :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall and ``scale * width`` + columns wide. + :param width: Width in cells (0-7). When 0, width is auto-calculated from the inner text. + :param numerator: Fractional scaling numerator (0-15). + :param denominator: Fractional scaling denominator (0-15). + :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). + :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). + """ + + scale: int = 1 + width: int = 0 + numerator: int = 0 + denominator: int = 0 + vertical_align: int = 0 + horizontal_align: int = 0 + + def __repr__(self) -> str: + """ + Return a compact representation including only non-default fields. + + This avoids verbose output when most fields are defaults. 
+ """ + # modified to show values only when non-default + repr_fmt = ', '.join(f'{field.name}={getattr(self, field.name)}' + for field in TEXT_FIELD_MAPPING.values() + if getattr(self, field.name) != field.default) + return f'{self.__class__.__name__}({repr_fmt})' + + def make_sequence(self) -> str: + """Build and return sub-part of an OSC 66 sequence.""" + parts = [] + # build string for all known parameters of non-default values + for field_key, field in TEXT_FIELD_MAPPING.items(): + if (val := getattr(self, field.name)) != field.default: + parts.append(f'{field_key}={val}') + return ':'.join(parts) + + @classmethod + def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams: + """ + Parse colon-separated ``key=value`` metadata string. + + :param raw: Metadata string, e.g. ``'s=2:w=3'``. + :param control_codes: 'parse' or 'strict'. + :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise + ValueError. + :returns: Parsed parameters with values clamped to valid ranges. + Unknown keys are ignored. Non-integer values use defaults. 
+ + Example:: + + >>> TextSizingParams.from_params('s=2:w=3') + TextSizingParams(scale=2, width=3, numerator=0, denominator=0, \ + vertical_align=0, horizontal_align=0) + """ + kwargs: typing.Dict[str, int] = {} + if not raw: + return cls() + for part in raw.split(':'): + if '=' not in part: + if control_codes == 'strict': + raise ValueError(f"Expected '=' in text sizing parameter (key=val), " + f"got {part!r} in OSC 66 sequence, {raw!r}") + continue + key, _eq, val = part.partition('=') + field = TEXT_FIELD_MAPPING.get(key) + if field is None: + if control_codes == 'strict': + raise ValueError(f"Unknown text sizing field '{key}' " + f"in OSC 66 sequence, {raw!r}") + # ignore unknown fields unless 'strict' + continue + try: + value = int(val) + except ValueError as exc: + if control_codes == 'strict': + raise ValueError(f"Illegal text sizing value '{val}' " + f"in OSC 66 sequence, {raw!r}: {exc}") from exc + # ignore value, uses default value without warning unless 'strict' + continue + if control_codes == 'strict' and (value > field.high or value < field.low): + raise ValueError(f"Out of bounds text sizing value '{val}' " + f"in OSC 66 sequence, {raw!r}: " + f"allowed range for '{key}' ({field.name}) " + f"is {field.low} to {field.high}") + kwargs[field.name] = max(field.low, min(field.high, value)) + return cls(**kwargs) + + +class TextSizing(typing.NamedTuple): + """Basic horizontal width measurement for kitty text sizing protocol.""" + + params: TextSizingParams + text: str + terminator: str + + @classmethod + def from_match(cls, match: re.Match[str], control_codes: str = 'parse') -> TextSizing: + r""" + Parse using matching OSC 66 Sequence. + + :param match: match object from :attr:`wcwidth.escape_sequences.TEXT_SIZING_PATTERN`. + :param control_codes: 'parse' or 'strict', same meaning as delegated by + :func:`wcwidth.width`. + :raises ValueError: When ``control_codes='strict'`` for unrecognized, invalid, or out of + bounds text sizing parameters. 
+ :returns: TextSizing object from parsed sequence + + Example:: + + from wcwidth.escape_sequences import TEXT_SIZING_PATTERN + >>> TextSizing.from_match(TEXT_SIZING_PATTERN.match('\x1b]66;w=2;XY\x07')) + TextSizing(params=TextSizingParams(scale=1, width=2, numerator=0, denominator=0, \ + vertical_align=0, horizontal_align=0), text='XY', terminator='\x07') + """ + return cls(params=TextSizingParams.from_params(match.group(1), control_codes=control_codes), + text=match.group(2), + terminator=match.group(3)) + + def display_width(self, ambiguous_width: int = 1) -> int: + """ + Calculate the display width of a text sizing sequence. + + :param ambiguous_width: Width for East Asian Ambiguous characters. + :returns: Display width in terminal cells. When ``width > 0``, returns ``params.scale * + params.width``. When ``width == 0``, returns ``params.scale * measured_inner_width``. + """ + if self.params.width > 0: + return self.params.scale * self.params.width + w = wcswidth(self.text, ambiguous_width=ambiguous_width) + return self.params.scale * max(0, w) + + def make_sequence(self) -> str: + """Build and return complete OSC 66 Terminal Sequence.""" + return f'\x1b]66;{self.params.make_sequence()};{self.text}{self.terminator}' From f980a1e60e7d29c0c338dbacfa70e3647a825c44 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 00:43:24 -0400 Subject: [PATCH 31/70] use python 3.13 docformatter --- tests/test_clip_cursors.py | 2 +- tests/test_textwrap.py | 2 +- tox.ini | 6 +++--- wcwidth/wcwidth.py | 22 +++++++++++----------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index f19e8e9..3da99b2 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -57,4 +57,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): the clipped output if the moved-to columns are within the clip window; cursor-left allows subsequent characters to overwrite previous 
content and the clip should reflect that. """ - assert repr(clip(text, start, end)) == repr(expected) \ No newline at end of file + assert repr(clip(text, start, end)) == repr(expected) diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index b264472..b502c73 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -502,7 +502,7 @@ def test_wrap_replace_whitespace_false_newlines_zero_width(): ('abc' + TS3 + 'def', 8, ['abc', TS3 + 'def']), ('abc' + TS3 + 'def', 10, ['abc' + TS3 + 'def']), # Sequence stays with preceding word when total stripped width fits - ('aa' + TS3 + 'bb', 5, ['aa', TS3 + 'bb']), + ('aa' + TS3 + 'bb', 5, ['aa', TS3 + 'bb']), ('pre' + TS3 + 'post', 8, ['pre', TS3 + 'post']), ]) def test_wrap_ts_line_fill(text, w, expected): diff --git a/tox.ini b/tox.ini index df77374..8c5d19a 100644 --- a/tox.ini +++ b/tox.ini @@ -165,13 +165,13 @@ commands = pydocstyle --source --explain {toxinidir}/wcwidth [testenv:docformatter] basepython = python3.13 -deps = docformatter>=1.7.8 +deps = docformatter==1.7.7 commands = docformatter --in-place --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ [testenv:docformatter_check] -basepython = python3.14 +basepython = python3.13 deps = {[testenv:docformatter]deps} commands = docformatter --check --diff --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ @@ -214,7 +214,7 @@ commands = codespell --skip="*.pyc,htmlcov,_build,build,*.egg-info,.tox,data,./t --summary --count [testenv:format] -basepython = python3.14 +basepython = python3.13 deps = {[testenv:isort]deps} {[testenv:docformatter]deps} {[testenv:autopep8]deps} diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index b969fb8..79efd76 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -63,7 +63,8 @@ # std imports from functools import lru_cache -from typing import Literal, NamedTuple, Union + +from typing import 
Union, Literal, NamedTuple # local # pylint: disable=unused-import @@ -103,7 +104,6 @@ strip_sequences) from .unicode_versions import list_versions - # Token types for output_tokens used by clip(). # NamedTuple subclasses provide named attribute access while remaining # plain tuples at runtime — zero overhead over the old bare-tuple approach, @@ -418,11 +418,11 @@ def _write_cells(s: str, w: int, write_col: int) -> None: # 1. Overwriting the right half of a wide char leaves left half as fillchar if write_col > 0 and write_col - 1 in cells and cells[write_col - 1][1] == 2: cells[write_col - 1] = (fillchar, 1) - + # 2. Overwriting the left half of a wide char leaves right half as fillchar if w == 1 and write_col in cells and cells[write_col][1] == 2: cells[write_col + 1] = (fillchar, 1) - + if w == 2 and write_col + 1 in cells and cells[write_col + 1][1] == 2: cells[write_col + 2] = (fillchar, 1) @@ -430,7 +430,7 @@ def _write_cells(s: str, w: int, write_col: int) -> None: for i in range(w): if write_col + i in cells: del cells[write_col + i] - + cells[write_col] = (s, w) if propagate_sgr and sgr_at_clip_start is None: @@ -551,13 +551,13 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: # Reconstruct result from painter's algorithm grid. 
parts: list[str] = [] - + seqs_by_col: dict[int, list[tuple[int, str]]] = {} for seq_col, seq_ord, seq_text in sequences: if seq_col not in seqs_by_col: seqs_by_col[seq_col] = [] seqs_by_col[seq_col].append((seq_ord, seq_text)) - + for c in seqs_by_col: seqs_by_col[c].sort() @@ -577,14 +577,14 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: if walk_col in cells: cell_text, cell_w = cells[walk_col] - + # Calculate overlap with [start, end) cell_start = walk_col cell_end = walk_col + cell_w - + overlap_start = max(start, cell_start) overlap_end = min(end, cell_end) - + if overlap_start < overlap_end: if cell_start >= start and cell_end <= end: # Fully inside @@ -592,7 +592,7 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: else: # Partially inside (split wide char) parts.append(fillchar * (overlap_end - overlap_start)) - + walk_col += cell_w else: # It's a hole. Only emit fillchar if we are inside the clip window From 24f1cdc31cb7320cba41eeee1ac731e81fd20b6b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 01:20:58 -0400 Subject: [PATCH 32/70] refactor --- tests/test_benchmarks.py | 26 +--- wcwidth/__init__.py | 14 +-- wcwidth/wcwidth.py | 248 +++++++++++++++++++-------------------- 3 files changed, 127 insertions(+), 161 deletions(-) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index f26ced5..03c7e86 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -293,34 +293,16 @@ def test_clip_complex_sgr(benchmark): benchmark(wcwidth.clip, text, 6, 11) -# OSC 66 (kitty text sizing protocol) benchmarks -OSC66_SMALL = '\x1b]66;w=2;XY\x07' -OSC66_SCALED = '\x1b]66;s=3;ABC\x07' -OSC66_LONG = ( - '\x1b]66;w=2;XY\x07' * 5 + - 'interleaved text ' * 5 + - '\x1b]66;s=3;ABC\x07' * 5 -) - - -@pytest.mark.parametrize('label,text', [ - ('small', OSC66_SMALL), - ('scaled', OSC66_SCALED), - ('long', OSC66_LONG), -], ids=lambda v: f'osc66_{v}') -def test_width_osc66(benchmark, label, text): +def 
test_width_osc66(benchmark): """Benchmark width() with OSC 66 sequences.""" + text = '\x1b]66;w=2;XY\x07\x1b]66;s=3;ABC\x07' benchmark(wcwidth.width, text) -@pytest.mark.parametrize('label,text,start,end', [ - ('small', OSC66_SMALL, 0, 2), - ('scaled', OSC66_SCALED, 0, 9), - ('long', OSC66_LONG, 10, 60), -], ids=lambda v: f'osc66_{v}') def test_clip_osc66(benchmark, label, text, start, end): """Benchmark clip() with OSC 66 sequences.""" - benchmark(wcwidth.clip, text, start, end) + text = '\x1b]66;w=2;XY\x07\x1b]66;s=3;ABC\x07' + benchmark(wcwidth.clip, text, 3, 8) def test_propagate_sgr_multiline(benchmark): diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index ad43abf..906ec29 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -43,16 +43,4 @@ # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places # Prefer the installed distribution version when available (helps test environments) -try: - # std imports - from importlib import metadata as importlib_metadata -except ImportError: # pragma: no cover - fallback for very old Pythons - importlib_metadata = None # type: ignore[assignment] - -if importlib_metadata is not None: - try: - __version__ = importlib_metadata.version('wcwidth') - except importlib_metadata.PackageNotFoundError: - __version__ = '0.6.1' -else: - __version__ = '0.6.1' # don't forget to also update pyproject.toml:version +__version__ = '0.6.1' # don't forget to also update pyproject.toml:version diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 79efd76..4d25cb3 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -64,7 +64,7 @@ # std imports from functools import lru_cache -from typing import Union, Literal, NamedTuple +from typing import Callable, Union, Literal, NamedTuple # local # pylint: disable=unused-import @@ -415,24 +415,15 @@ def clip( def _write_cells(s: str, w: 
int, write_col: int) -> None: nonlocal sgr_at_clip_start if w > 0: - # 1. Overwriting the right half of a wide char leaves left half as fillchar - if write_col > 0 and write_col - 1 in cells and cells[write_col - 1][1] == 2: - cells[write_col - 1] = (fillchar, 1) - - # 2. Overwriting the left half of a wide char leaves right half as fillchar - if w == 1 and write_col in cells and cells[write_col][1] == 2: - cells[write_col + 1] = (fillchar, 1) - - if w == 2 and write_col + 1 in cells and cells[write_col + 1][1] == 2: - cells[write_col + 2] = (fillchar, 1) - - # 3. Clean up the cells we are fully overwriting - for i in range(w): - if write_col + i in cells: - del cells[write_col + i] - + # Fix up wide-char orphans and clear overwritten cells in one pass + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + cells.pop(src_col, None) cells[write_col] = (s, w) - if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr @@ -451,39 +442,34 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: if col >= end and sgr_at_clip_start is not None and char != '\x1b': break - # 1. Handle escape sequences + # 1. Handle escape sequences and bare ESC if char == '\x1b': if (match := ZERO_WIDTH_PATTERN.match(text, idx)): seq = match.group() if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): # Update SGR state; will be applied as prefix when visible content starts sgr = _sgr_state_update(sgr, seq) - # we've consumed the sequence; advance index and continue idx = match.end() continue - # Handle cursor movement sequences specially to simulate visible - # effects (fillchar padding for rightward moves, overwrite for left). 
- if (match_cleft := CURSOR_RIGHT_SEQUENCE.match(seq)): - # parse numeric argument (default 1) - digit_txt = match_cleft.group(1) - n_left = int(digit_txt) if digit_txt else 1 - # If movement crosses into the clip window, emit fillchars - move_start = col - move_end = col + n_left - if move_start < end and move_end > start: - overlap_start = max(move_start, start) - overlap_end = min(move_end, end) - for i in range(overlap_start, overlap_end): + # Cursor-forward sequences (e.g. CSI n C) advance the column; + # simulate by emitting fillchars for the visible portion. + if (match_cforward := CURSOR_RIGHT_SEQUENCE.match(seq)): + digit_txt = match_cforward.group(1) + n_forward = int(digit_txt) if digit_txt else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): _write_cells(fillchar, 1, i) - col += n_left + col = move_end idx = match.end() continue - if (match_cright := CURSOR_LEFT_SEQUENCE.match(seq)): - digit_txt = match_cright.group(1) - n_right = int(digit_txt) if digit_txt else 1 - col = max(0, col - n_right) + # Cursor-backward sequences (e.g. CSI n D) retreat the column. + if (match_cbackward := CURSOR_LEFT_SEQUENCE.match(seq)): + digit_txt = match_cbackward.group(1) + n_backward = int(digit_txt) if digit_txt else 1 + col = max(0, col - n_backward) idx = match.end() continue @@ -500,16 +486,15 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: idx = match.end() continue - # Other zero-width sequences (OSC hyperlinks, etc.) — preserve as-is + # Other zero-width sequences (OSC hyperlinks, etc.) are preserved as-is _append_seq(seq) idx = match.end() continue - - # 2. Handle bare ESC (not a valid sequence) - if char == '\x1b': - _append_seq(char) - idx += 1 - continue + else: + # Bare ESC not matching any recognized sequence pattern + _append_seq(char) + idx += 1 + continue # 3. 
TAB expansion if char == '\t': @@ -530,46 +515,43 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) if grapheme_w == 0: - # combining/zero-width grapheme; preserve as sequence-like token at this column + # combining/zero-width grapheme; preserve as token at this column if start <= col < end: _append_seq(grapheme) elif col >= start and col + grapheme_w <= end: # Fully visible _write_cells(grapheme, grapheme_w, col) - col += grapheme_w elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) -> emit fillchars for visible portion - overlap = min(end, col + grapheme_w) - max(start, col) - abs_start = max(start, col) - for i in range(overlap): - _write_cells(fillchar, 1, abs_start + i) - col += grapheme_w - else: - col += grapheme_w - + # Partially visible (wide char at boundary) — emit fillchars + clip_start = max(start, col) + for i in range(min(end, col + grapheme_w) - clip_start): + _write_cells(fillchar, 1, clip_start + i) + # advance column whether visible or not + col += grapheme_w idx += len(grapheme) - # Reconstruct result from painter's algorithm grid. 
- parts: list[str] = [] - + # ── Reconstruct result from painter's algorithm grid ────────────────── + # Build column→sorted sequences index seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for seq_col, seq_ord, seq_text in sequences: - if seq_col not in seqs_by_col: - seqs_by_col[seq_col] = [] - seqs_by_col[seq_col].append((seq_ord, seq_text)) - - for c in seqs_by_col: - seqs_by_col[c].sort() - - max_col = max((max(cells.keys()) + 1 if cells else 0), - (max(seqs_by_col.keys()) if seqs_by_col else 0)) - + for col_pos, order, seq_text in sequences: + seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) + for entries in seqs_by_col.values(): + entries.sort() + + max_cell_col = max(cells.keys()) if cells else -1 + max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 + max_col = max(max_cell_col, max_seq_col) + + # Walk columns 0..min(max_col, end), emitting sequences then any cell + # or fillchar occupying each position. Visits *inclusive* of + # min(max_col, end) so sequences at `end` are preserved. 
+ parts: list[str] = [] walk_col = 0 - # walk_col reaches exactly up to end, to ensure sequences at `end` are processed - while walk_col <= max_col and walk_col <= end: - if walk_col in seqs_by_col: - for _, seq_text in seqs_by_col[walk_col]: - parts.append(seq_text) + col_limit = min(max_col, end) + while walk_col <= col_limit: + # Zero-width sequences at this column + for _, seq_text in seqs_by_col.get(walk_col, ()): + parts.append(seq_text) if walk_col >= end: walk_col += 1 @@ -577,34 +559,27 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: if walk_col in cells: cell_text, cell_w = cells[walk_col] - - # Calculate overlap with [start, end) - cell_start = walk_col cell_end = walk_col + cell_w - overlap_start = max(start, cell_start) - overlap_end = min(end, cell_end) - - if overlap_start < overlap_end: - if cell_start >= start and cell_end <= end: - # Fully inside - parts.append(cell_text) - else: - # Partially inside (split wide char) - parts.append(fillchar * (overlap_end - overlap_start)) + if walk_col >= start and cell_end <= end: + # Fully inside clip window + parts.append(cell_text) + elif cell_end > start: + # Partial overlap (wide char split at boundary) + parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) + # else: cell entirely before start — skip walk_col += cell_w else: - # It's a hole. Only emit fillchar if we are inside the clip window - # AND if we are within the bounds of where visible text was written. - if walk_col >= start and cells and walk_col < max(cells.keys()) + 1: + # Hole: emit fillchar for columns inside [start, end) that + # lie within the written cell area + if walk_col >= start and walk_col <= max_cell_col: parts.append(fillchar) walk_col += 1 - # Append any remaining sequences that occurred past the clip end boundary - # This preserves SGR resets and trailing hyperlinks if the loop broke early + # Trailing sequences past col_limit (SGR resets after short text, etc.) 
for c in sorted(seqs_by_col.keys()): - if c > end or (c == end and c > max_col): + if c > col_limit: for _, seq_text in seqs_by_col[c]: parts.append(seq_text) @@ -626,79 +601,100 @@ def _text_sizing_clip( col: int, start: int, end: int, - write_cells, + write_cells: Callable[[str, int, int], None], fillchar: str = ' ', ambiguous_width: int = 1, ) -> int: """ - Emit tokens for a text-sizing sequence, clipped to ``[start, end)``. + Emit tokens for a text-sizing (OSC 66) sequence, clipped to ``[start, end)``. - Returns ``new_col``. + Returns ``new_col`` (column position after the sequence). """ # pylint: disable=too-many-locals ts_width = ts.display_width(ambiguous_width) + + # Sequence fully visible or fully outside: simple cases if col >= start and col + ts_width <= end: write_cells(ts.make_sequence(), ts_width, col) return col + ts_width if col >= end or col + ts_width <= start: return col + ts_width - # Partial overlap: decompose into units (graphemes at `scale` cells each), - # emit whole units as sequences and partial units as fillchars. + # Partial overlap: the sequence straddles a clip boundary. + # Decompose into unit cells (each grapheme occupies `scale` cells), + # emit as many whole units as fit inside [start, end), filling the + # remainder with `fillchar`. rel_start = max(0, start - col) rel_end = min(end, col + ts_width) - col scale = ts.params.scale + # Build the list of (grapheme, cell_width) units units: list[tuple[str, int]] = [] if ts.params.width > 0: - inner_graphemes = list(iter_graphemes(ts.text)) - for j in range(ts.params.width): - g = inner_graphemes[j] if j < len(inner_graphemes) else '' + # Fixed-width mode: explicit count at `scale` cells each. + # Use itertools.islice to avoid materializing the full grapheme list. 
+ from itertools import islice + for j, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): units.append((g, scale)) + # Pad with empty graphemes if text had fewer than width + for _ in range(ts.params.width - len(units)): + units.append(('', scale)) else: + # Auto-width mode: grapheme count derived from content, width varies for g in iter_graphemes(ts.text): units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) - pos = 0 - pending_texts: list[str] = [] + # Batch of consecutive fully-visible units that can be emitted as a + # single text-sizing sequence. + pending_units: list[tuple[str, int]] = [] # (grapheme_text, cell_width) - def flush(flush_col: int, flush_width: int): - if not pending_texts: + def flush(flush_col: int) -> None: + """Emit accumulated graphemes as one text-sizing sequence.""" + if not pending_units: return + texts = [u[0] for u in pending_units] + total_w = sum(u[1] for u in pending_units) params = TextSizingParams( scale, - len(pending_texts) if ts.params.width > 0 else 0, + len(texts) if ts.params.width > 0 else 0, ts.params.numerator, ts.params.denominator, ts.params.vertical_align, ts.params.horizontal_align) - write_cells(TextSizing(params, ''.join(pending_texts), ts.terminator).make_sequence(), flush_width, flush_col) - pending_texts.clear() - + write_cells( + TextSizing(params, ''.join(texts), ts.terminator).make_sequence(), + total_w, + flush_col) + pending_units.clear() + + # Walk units in cell-coordinate space, collecting consecutive fully-visible + # ones into a batch (flushed as one sequence) and emitting fillchars for + # partial units at the boundaries. 
flush_col_pos = col + rel_start - flush_width = 0 + unit_pos = 0 # current position in cell-coordinates within the sequence for unit_text, unit_w in units: - unit_start = pos - unit_end = pos + unit_w + unit_end = unit_pos + unit_w if unit_end <= rel_start: - pos = unit_end + # Unit is entirely before the clip window + unit_pos = unit_end continue - if unit_start >= rel_end: + if unit_pos >= rel_end: + # Unit is entirely past the clip window break - overlap = min(unit_end, rel_end) - max(unit_start, rel_start) + + overlap = min(unit_end, rel_end) - max(unit_pos, rel_start) if overlap == unit_w and unit_w > 0: - if not pending_texts: - flush_col_pos = col + max(unit_start, rel_start) - flush_width = 0 - pending_texts.append(unit_text) - flush_width += overlap + # Unit fits completely — batch it with others + if not pending_units: + flush_col_pos = col + max(unit_pos, rel_start) + pending_units.append((unit_text, unit_w)) else: - flush(flush_col_pos, flush_width) - flush_width = 0 - abs_start = col + max(unit_start, rel_start) + # Unit is partially clipped — flush batch, emit fillchars for remainder + flush(flush_col_pos) + abs_start = col + max(unit_pos, rel_start) for i in range(overlap): write_cells(fillchar, 1, abs_start + i) - pos = unit_end + unit_pos = unit_end - flush(flush_col_pos, flush_width) + flush(flush_col_pos) return col + ts_width From 08a1c635a71c6d7c408ca66f1467a1779b8e9d41 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 01:29:59 -0400 Subject: [PATCH 33/70] lint and turn if/else inward --- wcwidth/wcwidth.py | 98 +++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 4d25cb3..a2fd1bc 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -444,58 +444,58 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: # 1. 
Handle escape sequences and bare ESC if char == '\x1b': - if (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() - if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - idx = match.end() - continue - - # Cursor-forward sequences (e.g. CSI n C) advance the column; - # simulate by emitting fillchars for the visible portion. - if (match_cforward := CURSOR_RIGHT_SEQUENCE.match(seq)): - digit_txt = match_cforward.group(1) - n_forward = int(digit_txt) if digit_txt else 1 - move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) - col = move_end - idx = match.end() - continue - - # Cursor-backward sequences (e.g. CSI n D) retreat the column. - if (match_cbackward := CURSOR_LEFT_SEQUENCE.match(seq)): - digit_txt = match_cbackward.group(1) - n_backward = int(digit_txt) if digit_txt else 1 - col = max(0, col - n_backward) - idx = match.end() - continue - - if (ts_match := TEXT_SIZING_PATTERN.match(seq)): - # OSC 66 (text sizing) has positive width - col = _text_sizing_clip( - TextSizing.from_match(ts_match), - col=col, start=start, end=end, - write_cells=_write_cells, - fillchar=fillchar, ambiguous_width=ambiguous_width, - ) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - idx = match.end() - continue - - # Other zero-width sequences (OSC hyperlinks, etc.) 
are preserved as-is - _append_seq(seq) - idx = match.end() - continue - else: + if not (match := ZERO_WIDTH_PATTERN.match(text, idx)): # Bare ESC not matching any recognized sequence pattern _append_seq(char) idx += 1 continue + seq = match.group() + if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): + # Update SGR state; will be applied as prefix when visible content starts + sgr = _sgr_state_update(sgr, seq) + idx = match.end() + continue + + # Cursor-forward sequences (e.g. CSI n C) advance the column; + # simulate by emitting fillchars for the visible portion. + if (match_cforward := CURSOR_RIGHT_SEQUENCE.match(seq)): + digit_txt = match_cforward.group(1) + n_forward = int(digit_txt) if digit_txt else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) + col = move_end + idx = match.end() + continue + + # Cursor-backward sequences (e.g. CSI n D) retreat the column. + if (match_cbackward := CURSOR_LEFT_SEQUENCE.match(seq)): + digit_txt = match_cbackward.group(1) + n_backward = int(digit_txt) if digit_txt else 1 + col = max(0, col - n_backward) + idx = match.end() + continue + + if (ts_match := TEXT_SIZING_PATTERN.match(seq)): + # OSC 66 (text sizing) has positive width + col = _text_sizing_clip( + TextSizing.from_match(ts_match), + col=col, start=start, end=end, + write_cells=_write_cells, + fillchar=fillchar, ambiguous_width=ambiguous_width, + ) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + idx = match.end() + continue + + # Other zero-width sequences (OSC hyperlinks, etc.) are preserved as-is + _append_seq(seq) + idx = match.end() + continue + # 3. 
TAB expansion if char == '\t': if tabsize > 0: @@ -573,7 +573,7 @@ def _append_seq(seq: str, at_col: int | None = None) -> None: else: # Hole: emit fillchar for columns inside [start, end) that # lie within the written cell area - if walk_col >= start and walk_col <= max_cell_col: + if start <= walk_col <= max_cell_col: parts.append(fillchar) walk_col += 1 @@ -634,7 +634,7 @@ def _text_sizing_clip( # Fixed-width mode: explicit count at `scale` cells each. # Use itertools.islice to avoid materializing the full grapheme list. from itertools import islice - for j, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): + for _, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): units.append((g, scale)) # Pad with empty graphemes if text had fewer than width for _ in range(ts.params.width - len(units)): From 34b6db75c9b59f7a34eb0236a71e6c83fee9366d Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 02:33:32 -0400 Subject: [PATCH 34/70] checkpoint --- code_templates/grapheme_table.py.j2 | 2 +- code_templates/python_table.py.j2 | 2 +- code_templates/unicode_versions.py.j2 | 2 +- docs/intro.rst | 5 +- docs/unicode_version.rst | 15 + pyproject.toml | 2 +- wcwidth/__init__.py | 43 ++- wcwidth/_clip.py | 417 +++++++++++++++++++++++ wcwidth/_width.py | 67 ++-- wcwidth/escape_sequences.py | 15 + wcwidth/table_ambiguous.py | 3 +- wcwidth/table_grapheme.py | 3 +- wcwidth/table_mc.py | 3 +- wcwidth/table_vs16.py | 3 +- wcwidth/table_wide.py | 3 +- wcwidth/table_zero.py | 3 +- wcwidth/text_sizing.py | 2 +- wcwidth/wcwidth.py | 466 +------------------------- 18 files changed, 512 insertions(+), 544 deletions(-) create mode 100644 wcwidth/_clip.py diff --git a/code_templates/grapheme_table.py.j2 b/code_templates/grapheme_table.py.j2 index 6596613..424f061 100644 --- a/code_templates/grapheme_table.py.j2 +++ b/code_templates/grapheme_table.py.j2 @@ -4,7 +4,7 @@ Exports grapheme cluster break property tables for Unicode version {{ unicode_ve This 
module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code {%- for var_name, table_def in tables.items() %} diff --git a/code_templates/python_table.py.j2 b/code_templates/python_table.py.j2 index ec818c2..4591025 100644 --- a/code_templates/python_table.py.j2 +++ b/code_templates/python_table.py.j2 @@ -1,7 +1,7 @@ """ Exports {{ variable_name }} table keyed by supporting unicode version level. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code {{ variable_name }} = { diff --git a/code_templates/unicode_versions.py.j2 b/code_templates/unicode_versions.py.j2 index 7bd53c2..998323b 100644 --- a/code_templates/unicode_versions.py.j2 +++ b/code_templates/unicode_versions.py.j2 @@ -1,7 +1,7 @@ """ Exports function list_versions() for unicode version level support. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. """ from __future__ import annotations diff --git a/docs/intro.rst b/docs/intro.rst index a4ac498..808fad7 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -468,8 +468,9 @@ languages. History ======= -0.6.1 *2026-04-26* - * **New** `width()` now supports `kitty text sizing protocol`_ (OSC 66). +0.7.0 *2026-04-30* + * **Improved** `clip()` to support backward cursor sequence overwrite, "Painter's algorithm". + * **Improved** `width()` and `clip()` to support parsing of `kitty text sizing protocol`_ (OSC 66). 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst index 41a1e52..38ff78d 100644 --- a/docs/unicode_version.rst +++ b/docs/unicode_version.rst @@ -16,6 +16,21 @@ release files: ``emoji-variation-sequences-12.0.0.txt`` *Date: 2019-01-15, 12:10:05 GMT* +``emoji-variation-sequences-13.0.0.txt`` + *Date: 2020-01-21, 07:15:05 GMT* + +``emoji-variation-sequences-14.0.0.txt`` + *Date: 2021-06-08, 05:19:16 GMT* + +``emoji-variation-sequences-15.0.0.txt`` + *Date: 2022-05-13, 21:54:24 GMT* + +``emoji-variation-sequences-15.1.0.txt`` + *Date: 2023-02-01, 02:22:54 GMT* + +``emoji-variation-sequences-16.0.0.txt`` + *Date: 2024-05-01, 21:25:24 GMT* + ``emoji-variation-sequences-17.0.0.txt`` *Date: 2025-01-30, 21:48:29 GMT* diff --git a/pyproject.toml b/pyproject.toml index a713f87..5b04904 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "hatchling" ] [project] name = "wcwidth" -version = "0.6.1" # don't forget to also update wcwidth/__init__.py:__version__ +version = "0.7.0" # don't forget to also update wcwidth/__init__.py:__version__ description = "Measures the displayed width of unicode strings in a terminal" readme = "README.rst" keywords = [ diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 906ec29..dcece0a 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -1,37 +1,32 @@ """ -Wcwidth module. +Python 'wcwidth' module. https://github.com/jquast/wcwidth """ -# re-export all functions & definitions, even private ones, from top-level -# module path, to allow for 'from wcwidth import _private_func'. Of course, -# user beware that any _private functions or variables not exported by __all__ -# may disappear or change signature at any future version. 
+# re-export common and outermost functions & definitions, even a few private +# ones, some for convenience, others for legacy, only the items in __all__ are +# documented as public API # local -from .wcwidth import ZERO_WIDTH # noqa -from .wcwidth import (WIDE_EASTASIAN, - AMBIGUOUS_EASTASIAN, - VS16_NARROW_TO_WIDE, - clip, - ljust, - rjust, - width, - center, - wcwidth, - wcswidth, - list_versions, - iter_sequences, - strip_sequences, - _wcmatch_version, - _wcversion_value) +from ._clip import clip +from ._align import ljust, rjust, center +from ._width import width +# legacy +from .unicode_versions import list_versions +from ._wcwidth import wcwidth from .bisearch import bisearch as _bisearch -from .grapheme import grapheme_boundary_before # noqa -from .grapheme import iter_graphemes, iter_graphemes_reverse +from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap +from ._wcswidth import wcswidth from .sgr_state import propagate_sgr +from .table_vs16 import VS16_NARROW_TO_WIDE +from .table_wide import WIDE_EASTASIAN +# convenience +from .table_zero import ZERO_WIDTH from .text_sizing import TextSizing, TextSizingParams +from .table_ambiguous import AMBIGUOUS_EASTASIAN +from .escape_sequences import iter_sequences, strip_sequences # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
@@ -43,4 +38,4 @@ # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places # Prefer the installed distribution version when available (helps test environments) -__version__ = '0.6.1' # don't forget to also update pyproject.toml:version +__version__ = '0.7.0' # don't forget to also update pyproject.toml:version diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py new file mode 100644 index 0000000..f443339 --- /dev/null +++ b/wcwidth/_clip.py @@ -0,0 +1,417 @@ +"""This is a python implementation of clip().""" +# std imports +from itertools import islice + +from typing import Union, Callable, NamedTuple + +# local +from .grapheme import iter_graphemes +from .sgr_state import (_SGR_PATTERN, + _SGR_STATE_DEFAULT, + _sgr_state_update, + _sgr_state_is_active, + _sgr_state_to_sequence) +from .text_sizing import TextSizing, TextSizingParams +from .escape_sequences import _SEQUENCE_CLASSIFY + + +class VisToken(NamedTuple): + """A visible text segment with its display width and starting column.""" + + text: str + width: int + start_col: int + + +class SeqToken(NamedTuple): + """A zero-width terminal sequence (escape sequences, control chars, etc.).""" + + text: str + + +Token = Union[VisToken, SeqToken] + + +def clip( + text: str, + start: int, + end: int, + *, + fillchar: str = ' ', + tabsize: int = 8, + ambiguous_width: int = 1, + propagate_sgr: bool = True, +) -> str: + r""" + Clip text to display columns ``(start, end)`` while preserving all terminal sequences. + + This function extracts a substring based on visible column positions rather than + character indices. Terminal escape sequences are preserved in the output since + they have zero display width. If a wide character (width 2) would be split at + either boundary, it is replaced with ``fillchar``. 
+ + TAB characters (``\t``) are expanded to spaces up to the next tab stop, + controlled by the ``tabsize`` parameter. + + Cursor forward and backward sequences (``CSI n C``, ``CSI n D``) move the column; + backspace and carriage return are passed through unchanged as zero-width. + + :param text: String to clip, may contain terminal escape sequences. + :param start: Absolute starting column (inclusive, 0-indexed). + :param end: Absolute ending column (exclusive). + :param fillchar: Character to use when a wide character must be split at + a boundary (default space). Must have display width of 1. + :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through + as zero-width (preserved in output but don't advance column position). + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :param propagate_sgr: If True (default), SGR (terminal styling) sequences + are propagated. The result begins with any active style at the start + position and ends with a reset sequence if styles are active. + :returns: Substring of ``text`` spanning display columns ``(start, end)``, + with all terminal sequences preserved and wide characters at boundaries + replaced with ``fillchar``. + + SGR (terminal styling) sequences are propagated by default. The result + begins with any active style and ends with a reset:: + + >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) + '\x1b[1;34mworld\x1b[0m' + + Set ``propagate_sgr=False`` to disable this behavior. + + .. versionadded:: 0.3.0 + + .. versionchanged:: 0.5.0 + Added ``propagate_sgr`` parameter (default True). + + .. versionchanged:: 0.7.0 + Parses OSC 66 Sequences. 
+ + Example:: + + >>> clip('hello world', 0, 5) + 'hello' + >>> clip('中文字', 0, 3) # Wide char split at column 3 + '中 ' + >>> clip('a\tb', 0, 10) # Tab expanded to spaces + 'a b' + """ + # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks,W0101 + start = max(start, 0) + if end <= start: + return '' + + # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) + if text.isascii() and text.isprintable(): + return text[start:end] + + # Fast path: no escape sequences means no SGR tracking needed + if propagate_sgr and '\x1b' not in text: + propagate_sgr = False + + # SGR tracking state (only when propagate_sgr=True) sgr_at_clip_start is + # sgr state when first visible char emitted (None = not yet) + sgr_at_clip_start = None + # current active sgr state + sgr = None # current SGR state, updated by matches of _SGR_PATTERN + if propagate_sgr: + sgr = _SGR_STATE_DEFAULT + + # Painter's algorithm data structures: + # map column integer to a visible character (with its width) + cells: dict[int, tuple[str, int]] = {} + # map column integer to a list of zero-width sequences emitted at that position + # (col, seq_order, text) + sequences: list[tuple[int, int, str]] = [] + # ordering of sequences + seq_order = 0 + + col = 0 + idx = 0 + + def _write_cells(s: str, w: int, write_col: int) -> None: + nonlocal sgr_at_clip_start + if w > 0: + # Fix up wide-char orphans and clear overwritten cells in one pass + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + cells.pop(src_col, None) + cells[write_col] = (s, w) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + def _append_seq(seq: str, at_col: int | None = None) -> None: + nonlocal sgr_at_clip_start, seq_order + c = col if at_col is None 
else at_col + sequences.append((c, seq_order, seq)) + seq_order += 1 + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + while idx < len(text): + char = text[idx] + + # Early exit: past visible region, SGR captured, no escape ahead + if col >= end and sgr_at_clip_start is not None and char != '\x1b': + break + + # 1. Handle escape sequences and bare ESC — single regex dispatch + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + _append_seq(char) + idx += 1 + continue + + # Dispatch on which named group captured: + if (m.group('sgr_params')) is not None and (propagate_sgr and sgr): + sgr = _sgr_state_update(sgr, m.group()) + idx = m.end() + continue + + # 1a. Cursor forward, + if (cforward_n := m.group('cforward_n')) is not None: + n_forward = int(cforward_n) if cforward_n else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) + col = move_end + idx = m.end() + continue + + # 1b. Cursor backward, + if (cbackward_n := m.group('cbackward_n')) is not None: + n_backward = int(cbackward_n) if cbackward_n else 1 + col = max(0, col - n_backward) + idx = m.end() + continue + + # 1c. OSC 66 Text Sizing + if (ts_meta := m.group('ts_meta')) is not None: + ts_text = m.group('ts_text') + ts_term = m.group('ts_term') + col = _text_sizing_clip( + TextSizing( + TextSizingParams.from_params(ts_meta), + ts_text, + ts_term), + col=col, start=start, end=end, + write_cells=_write_cells, + fillchar=fillchar, ambiguous_width=ambiguous_width, + ) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + idx = m.end() + continue + + # 1d. Any other recognized zero-width sequence + _append_seq(m.group()) + idx = m.end() + continue + + # 2. 
TAB expansion + if char == '\t': + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + _write_cells(' ', 1, col) + col += 1 + else: + # preserve tab as-is + _append_seq(char) + idx += 1 + continue + + # 3. Grapheme clustering for everything else + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + + if grapheme_w == 0: + # combining/zero-width grapheme; preserve as token at this column + if start <= col < end: + _append_seq(grapheme) + elif col >= start and col + grapheme_w <= end: + # Fully visible + _write_cells(grapheme, grapheme_w, col) + elif col < end and col + grapheme_w > start: + # Partially visible (wide char at boundary) — emit fillchars + clip_start = max(start, col) + for i in range(min(end, col + grapheme_w) - clip_start): + _write_cells(fillchar, 1, clip_start + i) + # advance column whether visible or not + col += grapheme_w + idx += len(grapheme) + + # Reconstruct result from "painter's algorithm", this allows us to + # accurately depict clipping with horizontal movement + seqs_by_col: dict[int, list[tuple[int, str]]] = {} + for col_pos, order, seq_text in sequences: + seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) + for entries in seqs_by_col.values(): + entries.sort() + + max_cell_col = max(cells.keys()) if cells else -1 + max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 + max_col = max(max_cell_col, max_seq_col) + + # Walk columns 0..min(max_col, end), emitting sequences then any cell + # or fillchar occupying each position. Visits *inclusive* of + # min(max_col, end) so sequences at `end` are preserved. 
+ parts: list[str] = [] + walk_col = 0 + col_limit = min(max_col, end) + while walk_col <= col_limit: + # Zero-width sequences at this column + for _, seq_text in seqs_by_col.get(walk_col, ()): + parts.append(seq_text) + + if walk_col >= end: + walk_col += 1 + continue + + if walk_col in cells: + cell_text, cell_w = cells[walk_col] + cell_end = walk_col + cell_w + + if walk_col >= start and cell_end <= end: + # Fully inside clip window + parts.append(cell_text) + elif cell_end > start: + # Partial overlap (wide char split at boundary) + parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) + walk_col += cell_w + else: + # Hole: emit fillchar for columns inside (start, end) that lie + # within the written cell area + if start <= walk_col <= max_cell_col: + parts.append(fillchar) + walk_col += 1 + + # Trailing sequences past col_limit (SGR resets after short text, etc.) + for c in sorted(seqs_by_col.keys()): + if c > col_limit: + for _, seq_text in seqs_by_col[c]: + parts.append(seq_text) + + result = ''.join(parts) + + # Apply SGR prefix/suffix + if sgr_at_clip_start is not None: + if prefix := _sgr_state_to_sequence(sgr_at_clip_start): + result = prefix + result + if _sgr_state_is_active(sgr_at_clip_start): + result += '\x1b[0m' + + return result + + +def _text_sizing_clip( + ts: TextSizing, + *, + col: int, + start: int, + end: int, + write_cells: Callable[[str, int, int], None], + fillchar: str = ' ', + ambiguous_width: int = 1, +) -> int: + """ + Emit tokens for a text-sizing (OSC 66) sequence, clipped to ``[start, end)``. + + Returns ``new_col`` (column position after the sequence). 
+ """ + # pylint: disable=too-many-locals,too-many-branches,too-complex + ts_width = ts.display_width(ambiguous_width) + + # Sequence fully visible or fully outside: simple cases + if col >= start and col + ts_width <= end: + write_cells(ts.make_sequence(), ts_width, col) + return col + ts_width + if col >= end or col + ts_width <= start: + return col + ts_width + + # Partial overlap: the sequence straddles a clip boundary. + # Decompose into unit cells (each grapheme occupies `scale` cells), + # emit as many whole units as fit inside [start, end), filling the + # remainder with `fillchar`. + rel_start = max(0, start - col) + rel_end = min(end, col + ts_width) - col + scale = ts.params.scale + + # Build the list of (grapheme, cell_width) units + units: list[tuple[str, int]] = [] + if ts.params.width > 0: + # Fixed-width mode: explicit count at `scale` cells each. + # Use itertools.islice to avoid materializing the full grapheme list. + # std imports + for _, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): + units.append((g, scale)) + # Pad with empty graphemes if text had fewer than width + for _ in range(ts.params.width - len(units)): + units.append(('', scale)) + else: + # Auto-width mode: grapheme count derived from content, width varies + for g in iter_graphemes(ts.text): + units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) + + # Batch of consecutive fully-visible units that can be emitted as a + # single text-sizing sequence. 
+ pending_units: list[tuple[str, int]] = [] # (grapheme_text, cell_width) + + def flush(flush_col: int) -> None: + """Emit accumulated graphemes as one text-sizing sequence.""" + if not pending_units: + return + texts = [u[0] for u in pending_units] + total_w = sum(u[1] for u in pending_units) + params = TextSizingParams( + scale, + len(texts) if ts.params.width > 0 else 0, + ts.params.numerator, + ts.params.denominator, + ts.params.vertical_align, + ts.params.horizontal_align) + write_cells( + TextSizing(params, ''.join(texts), ts.terminator).make_sequence(), + total_w, + flush_col) + pending_units.clear() + + # Walk units in cell-coordinate space, collecting consecutive fully-visible + # ones into a batch (flushed as one sequence) and emitting fillchars for + # partial units at the boundaries. + flush_col_pos = col + rel_start + unit_pos = 0 # current position in cell-coordinates within the sequence + for unit_text, unit_w in units: + unit_end = unit_pos + unit_w + if unit_end <= rel_start: + # Unit is entirely before the clip window + unit_pos = unit_end + continue + if unit_pos >= rel_end: + # Unit is entirely past the clip window + break + + overlap = min(unit_end, rel_end) - max(unit_pos, rel_start) + if overlap == unit_w and unit_w > 0: + # Unit fits completely — batch it with others + if not pending_units: + flush_col_pos = col + max(unit_pos, rel_start) + pending_units.append((unit_text, unit_w)) + else: + # Unit is partially clipped — flush batch, emit fillchars for remainder + flush(flush_col_pos) + abs_start = col + max(unit_pos, rel_start) + for i in range(overlap): + write_cells(fillchar, 1, abs_start + i) + unit_pos = unit_end + + flush(flush_col_pos) + return col + ts_width diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 9dc68ed..e9b6032 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -12,15 +12,10 @@ _FITZPATRICK_RANGE, _REGIONAL_INDICATOR_SET) from .table_vs16 import VS16_NARROW_TO_WIDE -from .text_sizing import TextSizing +from 
.text_sizing import TextSizing, TextSizingParams from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT -from .escape_sequences import (ZERO_WIDTH_PATTERN, - TEXT_SIZING_PATTERN, - CURSOR_LEFT_SEQUENCE, - CURSOR_RIGHT_SEQUENCE, - INDETERMINATE_EFFECT_SEQUENCE, - strip_sequences) +from .escape_sequences import _SEQUENCE_CLASSIFY, INDETERMINATE_EFFECT_SEQUENCE, strip_sequences # In 'parse' mode, strings longer than this are checked for cursor-movement # controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to @@ -124,11 +119,7 @@ def width( # Check for cursor-affecting control characters if '\b' not in text and '\t' not in text and '\r' not in text: # Check for escape sequences that can't be ignored, if present - if '\x1b' not in text or ( - not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) and - not TEXT_SIZING_PATTERN.search(text) - ): + if '\x1b' not in text or not _SEQUENCE_CLASSIFY.search(text): control_codes = 'ignore' # Fast path for ignore mode, useful if you know the text is already free of control codes @@ -155,32 +146,36 @@ def width( while idx < text_len: char = text[idx] - # 1. Handle ESC sequences + # 1. ESC sequences if char == '\x1b': - # Check for all terminal sequences - if (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + # 1a. 
Errant ESC or unknown sequence: only the first character is zero-width + idx += 1 + else: + seq = m.group() if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): - raise ValueError(f"Indeterminate cursor sequence at position {idx}") - - # Apply cursor movement, - if (right := CURSOR_RIGHT_SEQUENCE.match(seq)): - current_col += int(right.group(1) or 1) - elif (left := CURSOR_LEFT_SEQUENCE.match(seq)): - current_col = max(0, current_col - int(left.group(1) or 1)) - - # Or OSC 66 (kitty text sizing) - elif (ts_match := TEXT_SIZING_PATTERN.match(seq)): - text_size = TextSizing.from_match(ts_match, control_codes=control_codes) + raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}") + + # 1b. cursor forward, backward, and OSC 66 text sizing width + if (cforward_n := m.group('cforward_n')) is not None: + current_col += int(cforward_n) if cforward_n else 1 + elif (cbackward_n := m.group('cbackward_n')) is not None: + current_col = max(0, current_col - (int(cbackward_n) if cbackward_n else 1)) + elif (ts_meta := m.group('ts_meta')) is not None: + ts_text = m.group('ts_text') + ts_term = m.group('ts_term') + assert ts_text is not None and ts_term is not None + text_size = TextSizing( + TextSizingParams.from_params(ts_meta, control_codes=control_codes), + ts_text, ts_term) current_col += text_size.display_width(ambiguous_width) - idx = match.end() - else: - # Errant ESC or unknown sequence: only the first character is zero-width - idx += 1 + # 1c. SGR and other zero-width sequences -- no column advance + idx = m.end() max_extent = max(max_extent, current_col) continue - # 2. Handle illegal and vertical control characters (zero width, error in strict) + # 2. Vertical or Illegal control characters zero width or error when 'strict' if char in ILLEGAL_CTRL: if strict: raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") @@ -193,7 +188,7 @@ def width( idx += 1 continue - # 3. Handle horizontal movement characters + # 3. 
Horizontal movement characters if char in HORIZONTAL_CTRL: if char == '\x09' and tabsize > 0: # Tab current_col += tabsize - (current_col % tabsize) @@ -206,7 +201,7 @@ def width( idx += 1 continue - # 4. Handle ZWJ + # 4. Zero-Width Joiner (ZWJ) if char == '\u200D': if last_was_virama: # ZWJ after virama requests explicit half-form rendering but @@ -222,14 +217,14 @@ def width( last_was_virama = False continue - # 5. Handle other zero-width characters (control chars) + # 5. Other zero-width characters (control chars) if char in ZERO_WIDTH_CTRL: idx += 1 continue ucs = ord(char) - # 6. Handle VS16: converts preceding narrow character to wide + # 6. VS16: converts preceding narrow character to wide if ucs == 0xFE0F: if last_measured_idx == idx - 1: if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]): diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index e9d5734..acd7e8e 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -11,6 +11,9 @@ import typing +# local +from .sgr_state import _SGR_PATTERN + # Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. # Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). # https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ @@ -46,6 +49,18 @@ # Cursor left movement: CSI [n] D, parameter may be parsed by width() CURSOR_LEFT_SEQUENCE = re.compile(r'\x1b\[(\d*)D') +# Combined pattern: a single regex that matches any zero-width escape sequence +# and classifies it via named groups, approx. 2x faster than redundant re.matches +# in clip() and width(). 
+_SEQUENCE_CLASSIFY = re.compile( + _SGR_PATTERN.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_RIGHT_SEQUENCE.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_LEFT_SEQUENCE.pattern.replace('(', '(?P', 1) + + '|' + + r'\x1b\]66;(?P[^;\x07\x1b]*);(?P[^\x07\x1b]*)(?P\x07|\x1b\\)' + + '|' + r'(?P(?:' + ZERO_WIDTH_PATTERN.pattern + '))' +) + # Indeterminate effect sequences - raise ValueError in 'strict' mode. The effects of these sequences # are likely to be undesirable, moving the cursor vertically or to any unknown position, and # otherwise not managed by the 'width' method of this library. diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index 2c40498..d2fdd6b 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,9 +1,8 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. +This code generated by python wcwidth project. """ - # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index d265b66..563792a 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,9 +4,8 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by python wcwidth project. """ - # pylint: disable=duplicate-code GRAPHEME_CR = ( diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 63acce9..663e93b 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,9 +1,8 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by python wcwidth project. 
""" - # pylint: disable=duplicate-code CATEGORY_MC = { '17.0.0': ( diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index a5fc0a8..9420156 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,9 +1,8 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. +This code generated by python wcwidth project. """ - # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { '9.0.0': ( diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 9d2ebd5..4ad7bc1 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,9 +1,8 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by python wcwidth project. """ - # pylint: disable=duplicate-code WIDE_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index b669f70..bee2431 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,9 +1,8 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by python wcwidth project. """ - # pylint: disable=duplicate-code ZERO_WIDTH = { '17.0.0': ( diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 451b747..3fc22b9 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -25,7 +25,7 @@ .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ -.. versionadded:: 0.6.1 +.. 
versionadded:: 0.7.0 """ from __future__ import annotations diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index a2fd1bc..ebdb63c 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -64,89 +64,12 @@ # std imports from functools import lru_cache -from typing import Callable, Union, Literal, NamedTuple +from typing import Literal # local -# pylint: disable=unused-import -# Some CONSTANTS imported are now unused, like _wcversion_value(), they were first defined in this -# file location, and remain there for API compatibility purposes _wcversion_value and -# _wcmatch_version are no longer used internally since version 0.5.0 (only the latest Unicode -# version is shipped), and many global constants, now unused here, were moved to _constants.py in -# version 0.6.1. -# -# They are retained for API compatibility with external tools like ucs-detect -# that may use these private functions. -# from ._width import width from ._wcwidth import wcwidth -from .bisearch import bisearch as _bisearch -from .grapheme import iter_graphemes -from ._wcswidth import wcswidth -from .sgr_state import (_SGR_PATTERN, - _SGR_STATE_DEFAULT, - _sgr_state_update, - _sgr_state_is_active, - _sgr_state_to_sequence) from ._constants import _LATEST_VERSION -from .table_vs16 import VS16_NARROW_TO_WIDE -from .table_wide import WIDE_EASTASIAN -from .table_zero import ZERO_WIDTH -from .text_sizing import TextSizing, TextSizingParams -from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL -from .table_grapheme import ISC_CONSONANT -from .table_ambiguous import AMBIGUOUS_EASTASIAN -from .escape_sequences import (ZERO_WIDTH_PATTERN, - TEXT_SIZING_PATTERN, - CURSOR_LEFT_SEQUENCE, - CURSOR_RIGHT_SEQUENCE, - INDETERMINATE_EFFECT_SEQUENCE, - iter_sequences, - strip_sequences) -from .unicode_versions import list_versions - -# Token types for output_tokens used by clip(). 
-# NamedTuple subclasses provide named attribute access while remaining -# plain tuples at runtime — zero overhead over the old bare-tuple approach, -# but with isinstance() type discrimination and meaningful attribute names. - - -class VisToken(NamedTuple): - """A visible text segment with its display width and starting column.""" - - text: str - width: int - start_col: int - - -class SeqToken(NamedTuple): - """A zero-width terminal sequence (escape sequences, control chars, etc.).""" - - text: str - - -Token = Union[VisToken, SeqToken] - -# Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API, -# or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly -# re-export imports here for no other reason than to satisfy the type checkers (mypy). Yak shavings. -__all__ = ( - 'ZERO_WIDTH', - 'WIDE_EASTASIAN', - 'AMBIGUOUS_EASTASIAN', - 'VS16_NARROW_TO_WIDE', - 'list_versions', - 'wcwidth', - 'wcswidth', - 'width', - 'iter_sequences', - 'ljust', - 'rjust', - 'center', - 'clip', - 'strip_sequences', - '_wcmatch_version', - '_wcversion_value', -) @lru_cache(maxsize=128) @@ -311,390 +234,3 @@ def center( right_pad = total_padding - left_pad return fillchar * left_pad + text + fillchar * right_pad - -def clip( - text: str, - start: int, - end: int, - *, - fillchar: str = ' ', - tabsize: int = 8, - ambiguous_width: int = 1, - propagate_sgr: bool = True, -) -> str: - r""" - Clip text to display columns ``(start, end)`` while preserving all terminal sequences. - - This function extracts a substring based on visible column positions rather than - character indices. Terminal escape sequences are preserved in the output since - they have zero display width. If a wide character (width 2) would be split at - either boundary, it is replaced with ``fillchar``. - - TAB characters (``\t``) are expanded to spaces up to the next tab stop, - controlled by the ``tabsize`` parameter. 
- - Other cursor movement characters (backspace, carriage return) and cursor - movement sequences are passed through unchanged as zero-width. - - :param text: String to clip, may contain terminal escape sequences. - :param start: Absolute starting column (inclusive, 0-indexed). - :param end: Absolute ending column (exclusive). - :param fillchar: Character to use when a wide character must be split at - a boundary (default space). Must have display width of 1. - :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through - as zero-width (preserved in output but don't advance column position). - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :param propagate_sgr: If True (default), SGR (terminal styling) sequences - are propagated. The result begins with any active style at the start - position and ends with a reset sequence if styles are active. - :returns: Substring of ``text`` spanning display columns ``(start, end)``, - with all terminal sequences preserved and wide characters at boundaries - replaced with ``fillchar``. - - SGR (terminal styling) sequences are propagated by default. The result - begins with any active style and ends with a reset:: - - >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) - '\x1b[1;34mworld\x1b[0m' - - Set ``propagate_sgr=False`` to disable this behavior. - - .. versionadded:: 0.3.0 - - .. versionchanged:: 0.5.0 - Added ``propagate_sgr`` parameter (default True). - - .. versionchanged:: 0.6.1 - Parses OSC 66 Sequences. 
- - Example:: - - >>> clip('hello world', 0, 5) - 'hello' - >>> clip('中文字', 0, 3) # Wide char split at column 3 - '中 ' - >>> clip('a\tb', 0, 10) # Tab expanded to spaces - 'a b' - """ - # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks,W0101 - # Again, for 'hot path', we avoid additional delegate functions and accept the cost - # of complexity for improved python performance. - start = max(start, 0) - if end <= start: - return '' - - # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) - if text.isascii() and text.isprintable(): - return text[start:end] - - # Fast path: no escape sequences means no SGR tracking needed - if propagate_sgr and '\x1b' not in text: - propagate_sgr = False - - # SGR tracking state (only when propagate_sgr=True) - # sgr_at_clip_start is sgr state when first visible char emitted (None = not yet) - sgr_at_clip_start = None - # current active sgr state - sgr = None # current SGR state, updated by matches of _SGR_PATTERN - if propagate_sgr: - sgr = _SGR_STATE_DEFAULT - - # Painter's algorithm data structures: - # 1. cells: maps column integer to a visible character (with its width) - # cells that are part of a wide character's right half are not populated. - # 2. sequences: maps column integer to a list of zero-width sequences emitted at that position - # and their chronological order number. 
- cells: dict[int, tuple[str, int]] = {} - sequences: list[tuple[int, int, str]] = [] # (col, seq_order, text) - seq_order = 0 # relative ordering of sequences - - col = 0 - idx = 0 - - def _write_cells(s: str, w: int, write_col: int) -> None: - nonlocal sgr_at_clip_start - if w > 0: - # Fix up wide-char orphans and clear overwritten cells in one pass - for offset in range(w): - src_col = write_col + offset - if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) - if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) - cells.pop(src_col, None) - cells[write_col] = (s, w) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - def _append_seq(seq: str, at_col: int | None = None) -> None: - nonlocal sgr_at_clip_start, seq_order - c = col if at_col is None else at_col - sequences.append((c, seq_order, seq)) - seq_order += 1 - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': - break - - # 1. Handle escape sequences and bare ESC - if char == '\x1b': - if not (match := ZERO_WIDTH_PATTERN.match(text, idx)): - # Bare ESC not matching any recognized sequence pattern - _append_seq(char) - idx += 1 - continue - - seq = match.group() - if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - idx = match.end() - continue - - # Cursor-forward sequences (e.g. CSI n C) advance the column; - # simulate by emitting fillchars for the visible portion. 
- if (match_cforward := CURSOR_RIGHT_SEQUENCE.match(seq)): - digit_txt = match_cforward.group(1) - n_forward = int(digit_txt) if digit_txt else 1 - move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) - col = move_end - idx = match.end() - continue - - # Cursor-backward sequences (e.g. CSI n D) retreat the column. - if (match_cbackward := CURSOR_LEFT_SEQUENCE.match(seq)): - digit_txt = match_cbackward.group(1) - n_backward = int(digit_txt) if digit_txt else 1 - col = max(0, col - n_backward) - idx = match.end() - continue - - if (ts_match := TEXT_SIZING_PATTERN.match(seq)): - # OSC 66 (text sizing) has positive width - col = _text_sizing_clip( - TextSizing.from_match(ts_match), - col=col, start=start, end=end, - write_cells=_write_cells, - fillchar=fillchar, ambiguous_width=ambiguous_width, - ) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - idx = match.end() - continue - - # Other zero-width sequences (OSC hyperlinks, etc.) are preserved as-is - _append_seq(seq) - idx = match.end() - continue - - # 3. TAB expansion - if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - _write_cells(' ', 1, col) - col += 1 - else: - # preserve tab as-is - _append_seq(char) - idx += 1 - continue - - # 4. 
Grapheme clustering for everything else - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - - if grapheme_w == 0: - # combining/zero-width grapheme; preserve as token at this column - if start <= col < end: - _append_seq(grapheme) - elif col >= start and col + grapheme_w <= end: - # Fully visible - _write_cells(grapheme, grapheme_w, col) - elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) — emit fillchars - clip_start = max(start, col) - for i in range(min(end, col + grapheme_w) - clip_start): - _write_cells(fillchar, 1, clip_start + i) - # advance column whether visible or not - col += grapheme_w - idx += len(grapheme) - - # ── Reconstruct result from painter's algorithm grid ────────────────── - # Build column→sorted sequences index - seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for col_pos, order, seq_text in sequences: - seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) - for entries in seqs_by_col.values(): - entries.sort() - - max_cell_col = max(cells.keys()) if cells else -1 - max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 - max_col = max(max_cell_col, max_seq_col) - - # Walk columns 0..min(max_col, end), emitting sequences then any cell - # or fillchar occupying each position. Visits *inclusive* of - # min(max_col, end) so sequences at `end` are preserved. 
- parts: list[str] = [] - walk_col = 0 - col_limit = min(max_col, end) - while walk_col <= col_limit: - # Zero-width sequences at this column - for _, seq_text in seqs_by_col.get(walk_col, ()): - parts.append(seq_text) - - if walk_col >= end: - walk_col += 1 - continue - - if walk_col in cells: - cell_text, cell_w = cells[walk_col] - cell_end = walk_col + cell_w - - if walk_col >= start and cell_end <= end: - # Fully inside clip window - parts.append(cell_text) - elif cell_end > start: - # Partial overlap (wide char split at boundary) - parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) - # else: cell entirely before start — skip - - walk_col += cell_w - else: - # Hole: emit fillchar for columns inside [start, end) that - # lie within the written cell area - if start <= walk_col <= max_cell_col: - parts.append(fillchar) - walk_col += 1 - - # Trailing sequences past col_limit (SGR resets after short text, etc.) - for c in sorted(seqs_by_col.keys()): - if c > col_limit: - for _, seq_text in seqs_by_col[c]: - parts.append(seq_text) - - result = ''.join(parts) - - # Apply SGR prefix/suffix - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): - result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): - result += '\x1b[0m' - - return result - - -def _text_sizing_clip( - ts: TextSizing, - *, - col: int, - start: int, - end: int, - write_cells: Callable[[str, int, int], None], - fillchar: str = ' ', - ambiguous_width: int = 1, -) -> int: - """ - Emit tokens for a text-sizing (OSC 66) sequence, clipped to ``[start, end)``. - - Returns ``new_col`` (column position after the sequence). 
- """ - # pylint: disable=too-many-locals - ts_width = ts.display_width(ambiguous_width) - - # Sequence fully visible or fully outside: simple cases - if col >= start and col + ts_width <= end: - write_cells(ts.make_sequence(), ts_width, col) - return col + ts_width - if col >= end or col + ts_width <= start: - return col + ts_width - - # Partial overlap: the sequence straddles a clip boundary. - # Decompose into unit cells (each grapheme occupies `scale` cells), - # emit as many whole units as fit inside [start, end), filling the - # remainder with `fillchar`. - rel_start = max(0, start - col) - rel_end = min(end, col + ts_width) - col - scale = ts.params.scale - - # Build the list of (grapheme, cell_width) units - units: list[tuple[str, int]] = [] - if ts.params.width > 0: - # Fixed-width mode: explicit count at `scale` cells each. - # Use itertools.islice to avoid materializing the full grapheme list. - from itertools import islice - for _, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): - units.append((g, scale)) - # Pad with empty graphemes if text had fewer than width - for _ in range(ts.params.width - len(units)): - units.append(('', scale)) - else: - # Auto-width mode: grapheme count derived from content, width varies - for g in iter_graphemes(ts.text): - units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) - - # Batch of consecutive fully-visible units that can be emitted as a - # single text-sizing sequence. 
- pending_units: list[tuple[str, int]] = [] # (grapheme_text, cell_width) - - def flush(flush_col: int) -> None: - """Emit accumulated graphemes as one text-sizing sequence.""" - if not pending_units: - return - texts = [u[0] for u in pending_units] - total_w = sum(u[1] for u in pending_units) - params = TextSizingParams( - scale, - len(texts) if ts.params.width > 0 else 0, - ts.params.numerator, - ts.params.denominator, - ts.params.vertical_align, - ts.params.horizontal_align) - write_cells( - TextSizing(params, ''.join(texts), ts.terminator).make_sequence(), - total_w, - flush_col) - pending_units.clear() - - # Walk units in cell-coordinate space, collecting consecutive fully-visible - # ones into a batch (flushed as one sequence) and emitting fillchars for - # partial units at the boundaries. - flush_col_pos = col + rel_start - unit_pos = 0 # current position in cell-coordinates within the sequence - for unit_text, unit_w in units: - unit_end = unit_pos + unit_w - if unit_end <= rel_start: - # Unit is entirely before the clip window - unit_pos = unit_end - continue - if unit_pos >= rel_end: - # Unit is entirely past the clip window - break - - overlap = min(unit_end, rel_end) - max(unit_pos, rel_start) - if overlap == unit_w and unit_w > 0: - # Unit fits completely — batch it with others - if not pending_units: - flush_col_pos = col + max(unit_pos, rel_start) - pending_units.append((unit_text, unit_w)) - else: - # Unit is partially clipped — flush batch, emit fillchars for remainder - flush(flush_col_pos) - abs_start = col + max(unit_pos, rel_start) - for i in range(overlap): - write_cells(fillchar, 1, abs_start + i) - unit_pos = unit_end - - flush(flush_col_pos) - return col + ts_width From 195a81d2d48c7d3a76d8f6ceb5efb1441f817f4d Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 02:52:43 -0400 Subject: [PATCH 35/70] more refactor/moving of files --- docs/intro.rst | 3 +- tests/test_core.py | 2 +- wcwidth/__init__.py | 14 +-- wcwidth/_wcwidth.py 
| 64 ---------- wcwidth/align.py | 134 +++++++++++++++++++++ wcwidth/{_clip.py => clip.py} | 6 +- wcwidth/text_sizing.py | 2 +- wcwidth/textwrap.py | 2 +- wcwidth/{_wcswidth.py => wcswidth.py} | 2 +- wcwidth/wcwidth.py | 166 +++++++------------------- wcwidth/{_width.py => width.py} | 4 +- 11 files changed, 195 insertions(+), 204 deletions(-) delete mode 100644 wcwidth/_wcwidth.py create mode 100644 wcwidth/align.py rename wcwidth/{_clip.py => clip.py} (99%) rename wcwidth/{_wcswidth.py => wcswidth.py} (99%) rename wcwidth/{_width.py => width.py} (99%) diff --git a/docs/intro.rst b/docs/intro.rst index 808fad7..3736f42 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -470,7 +470,8 @@ History 0.7.0 *2026-04-30* * **Improved** `clip()` to support backward cursor sequence overwrite, "Painter's algorithm". - * **Improved** `width()` and `clip()` to support parsing of `kitty text sizing protocol`_ (OSC 66). + * **Improved** `width()` and `clip()` to support parsing of `kitty text + sizing protocol`_ (OSC 66). 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, diff --git a/tests/test_core.py b/tests/test_core.py index 01c9fb5..f825050 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -9,7 +9,7 @@ # local import wcwidth -from wcwidth._width import _WIDTH_FAST_PATH_MIN_LEN +from wcwidth.width import _WIDTH_FAST_PATH_MIN_LEN _wcwidth_module = sys.modules['wcwidth.wcwidth'] # local diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index dcece0a..175c1f7 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -9,24 +9,22 @@ # documented as public API # local -from ._clip import clip -from ._align import ljust, rjust, center -from ._width import width -# legacy -from .unicode_versions import list_versions -from ._wcwidth import wcwidth +from .clip import clip +from .align import ljust, rjust, center +from .width import width +from .wcwidth import wcwidth, _wcmatch_version, _wcversion_value from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap -from ._wcswidth import wcswidth +from .wcswidth import wcswidth from .sgr_state import propagate_sgr from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN -# convenience from .table_zero import ZERO_WIDTH from .text_sizing import TextSizing, TextSizingParams from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import iter_sequences, strip_sequences +from .unicode_versions import list_versions # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
diff --git a/wcwidth/_wcwidth.py b/wcwidth/_wcwidth.py deleted file mode 100644 index 9e72418..0000000 --- a/wcwidth/_wcwidth.py +++ /dev/null @@ -1,64 +0,0 @@ -"""This is a python implementation of wcwidth().""" - -# std -# std imports -from functools import lru_cache - -# local -from .bisearch import bisearch -from ._constants import _AMBIGUOUS_TABLE, _ZERO_WIDTH_TABLE, _WIDE_EASTASIAN_TABLE - -# maxsize=1024: western scripts need ~64 unique codepoints per session, but -# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates -# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. - - -@lru_cache(maxsize=1024) -def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument - r""" - Given one Unicode codepoint, return its printable length on a terminal. - - :param wc: A single Unicode character. - :param unicode_version: Ignored. Retained for backwards compatibility. - - .. deprecated:: 0.3.0 - Only the latest Unicode version is now shipped. - - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts - where ambiguous characters display as double-width. See - :ref:`ambiguous_width` for details. - :returns: The width, in cells, necessary to display the character of - Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has - no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is - not printable, or has an indeterminate effect on the terminal, such as - a control character. Otherwise, the number of column positions the - character occupies on a graphic terminal (1 or 2) is returned. - - See :ref:`Specification` for details of cell measurement. - """ - ucs = ord(wc) if wc else 0 - - # small optimization: early return of 1 for printable ASCII, this provides - # approximately 40% performance improvement for mostly-ascii documents, with - # less than 1% impact to others. 
- if 32 <= ucs < 0x7f: - return 1 - - # C0/C1 control characters are -1 for compatibility with POSIX-like calls - if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: - return -1 - - # Zero width - if bisearch(ucs, _ZERO_WIDTH_TABLE): - return 0 - - # Wide (F/W categories) - if bisearch(ucs, _WIDE_EASTASIAN_TABLE): - return 2 - - # Ambiguous width (A category) - only when ambiguous_width=2 - if ambiguous_width == 2 and bisearch(ucs, _AMBIGUOUS_TABLE): - return 2 - - return 1 diff --git a/wcwidth/align.py b/wcwidth/align.py new file mode 100644 index 0000000..ddb6886 --- /dev/null +++ b/wcwidth/align.py @@ -0,0 +1,134 @@ +"""Python grapheme, emoji, and sequence-aware ljust, rjust, center().""" +from typing import Literal + +# local +from .width import width + + +def ljust( + text: str, + dest_width: int, + fillchar: str = ' ', + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text left-justified in a string of given display width. + + :param text: String to justify, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. + :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on the right to reach ``dest_width``. + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.ljust('hi', 5) + 'hi ' + >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5) + '\x1b[31mhi\x1b[0m ' + >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + '👨‍👩‍👧 ' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + padding_cells = max(0, dest_width - text_width) + return text + fillchar * padding_cells + + +def rjust( + text: str, + dest_width: int, + fillchar: str = ' ', + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text right-justified in a string of given display width. + + :param text: String to justify, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. + :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on the left to reach ``dest_width``. + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.rjust('hi', 5) + ' hi' + >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5) + ' \x1b[31mhi\x1b[0m' + >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + ' 👨‍👩‍👧' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + padding_cells = max(0, dest_width - text_width) + return fillchar * padding_cells + text + + +def center( + text: str, + dest_width: int, + fillchar: str = ' ', + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text centered in a string of given display width. + + :param text: String to center, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. + :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on both sides to reach ``dest_width``. + + For odd-width padding, the extra cell goes on the right (matching + Python's :meth:`str.center` behavior). + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.center('hi', 6) + ' hi ' + >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6) + ' \x1b[31mhi\x1b[0m ' + >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + ' 👨‍👩‍👧 ' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + total_padding = max(0, dest_width - text_width) + # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html + left_pad = total_padding // 2 + (total_padding & dest_width & 1) + right_pad = total_padding - left_pad + return fillchar * left_pad + text + fillchar * right_pad diff --git a/wcwidth/_clip.py b/wcwidth/clip.py similarity index 99% rename from wcwidth/_clip.py rename to wcwidth/clip.py index f443339..00f0660 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/clip.py @@ -5,9 +5,9 @@ from typing import Union, Callable, NamedTuple # local +from .width import width from .grapheme import iter_graphemes -from .sgr_state import (_SGR_PATTERN, - _SGR_STATE_DEFAULT, +from .sgr_state import (_SGR_STATE_DEFAULT, _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) @@ -114,7 +114,7 @@ def clip( # sgr state when first visible char emitted (None = not yet) sgr_at_clip_start = None # current active sgr state - sgr = None # current SGR state, updated by matches of _SGR_PATTERN + sgr = None # current SGR state, updated by SGR matches if propagate_sgr: sgr = _SGR_STATE_DEFAULT diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py index 3fc22b9..5a930a3 100644 --- a/wcwidth/text_sizing.py +++ b/wcwidth/text_sizing.py @@ -36,7 +36,7 @@ import typing # local -from ._wcswidth import wcswidth +from .wcswidth import wcswidth class _FieldMeta(typing.NamedTuple): diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index f56b4a2..e0f360d 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -15,7 +15,7 @@ from typing import TYPE_CHECKING, NamedTuple # 
local -from ._width import width as wcwidth_width +from .width import width as wcwidth_width from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences diff --git a/wcwidth/_wcswidth.py b/wcwidth/wcswidth.py similarity index 99% rename from wcwidth/_wcswidth.py rename to wcwidth/wcswidth.py index 423a6af..14b3250 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/wcswidth.py @@ -3,7 +3,7 @@ import typing # local -from ._wcwidth import wcwidth +from .wcwidth import wcwidth from .bisearch import bisearch from ._constants import (_EMOJI_ZWJ_SET, _ISC_VIRAMA_SET, diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index ebdb63c..c055fb7 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -64,12 +64,9 @@ # std imports from functools import lru_cache -from typing import Literal - # local -from ._width import width -from ._wcwidth import wcwidth -from ._constants import _LATEST_VERSION +from .bisearch import bisearch +from ._constants import _LATEST_VERSION, _AMBIGUOUS_TABLE, _ZERO_WIDTH_TABLE, _WIDE_EASTASIAN_TABLE @lru_cache(maxsize=128) @@ -106,131 +103,56 @@ def _wcmatch_version(given_version: str) -> str: # pylint: disable=unused-argum return _LATEST_VERSION -def ljust( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text left-justified in a string of given display width. - - :param text: String to justify, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. 
- :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on the right to reach ``dest_width``. - - .. versionadded:: 0.3.0 - - Example:: +# maxsize=1024: western scripts need ~64 unique codepoints per session, but +# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates +# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. - >>> wcwidth.ljust('hi', 5) - 'hi ' - >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5) - '\x1b[31mhi\x1b[0m ' - >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - '👨‍👩‍👧 ' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - padding_cells = max(0, dest_width - text_width) - return text + fillchar * padding_cells - - -def rjust( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: +@lru_cache(maxsize=1024) +def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument r""" - Return text right-justified in a string of given display width. - - :param text: String to justify, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on the left to reach ``dest_width``. 
+ Given one Unicode codepoint, return its printable length on a terminal. - .. versionadded:: 0.3.0 + :param wc: A single Unicode character. + :param unicode_version: Ignored. Retained for backwards compatibility. - Example:: + .. deprecated:: 0.3.0 + Only the latest Unicode version is now shipped. - >>> wcwidth.rjust('hi', 5) - ' hi' - >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5) - ' \x1b[31mhi\x1b[0m' - >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - padding_cells = max(0, dest_width - text_width) - return fillchar * padding_cells + text - - -def center( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text centered in a string of given display width. - - :param text: String to center, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on both sides to reach ``dest_width``. + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts + where ambiguous characters display as double-width. See + :ref:`ambiguous_width` for details. + :returns: The width, in cells, necessary to display the character of + Unicode string character, ``wc``. 
Returns 0 if the ``wc`` argument has + no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is + not printable, or has an indeterminate effect on the terminal, such as + a control character. Otherwise, the number of column positions the + character occupies on a graphic terminal (1 or 2) is returned. + + See :ref:`Specification` for details of cell measurement. + """ + ucs = ord(wc) if wc else 0 - For odd-width padding, the extra cell goes on the right (matching - Python's :meth:`str.center` behavior). + # small optimization: early return of 1 for printable ASCII, this provides + # approximately 40% performance improvement for mostly-ascii documents, with + # less than 1% impact to others. + if 32 <= ucs < 0x7f: + return 1 - .. versionadded:: 0.3.0 + # C0/C1 control characters are -1 for compatibility with POSIX-like calls + if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: + return -1 - Example:: + # Zero width + if bisearch(ucs, _ZERO_WIDTH_TABLE): + return 0 - >>> wcwidth.center('hi', 6) - ' hi ' - >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6) - ' \x1b[31mhi\x1b[0m ' - >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧 ' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - total_padding = max(0, dest_width - text_width) - # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html - left_pad = total_padding // 2 + (total_padding & dest_width & 1) - right_pad = total_padding - left_pad - return fillchar * left_pad + text + fillchar * right_pad + # Wide (F/W categories) + if bisearch(ucs, _WIDE_EASTASIAN_TABLE): + return 2 + + # Ambiguous width (A category) - only when ambiguous_width=2 + if ambiguous_width == 2 and bisearch(ucs, _AMBIGUOUS_TABLE): + return 2 + return 1 diff --git a/wcwidth/_width.py b/wcwidth/width.py similarity index 99% rename from wcwidth/_width.py rename to wcwidth/width.py index 
e9b6032..82d8f78 100644 --- a/wcwidth/_width.py +++ b/wcwidth/width.py @@ -3,9 +3,9 @@ from typing import Literal # local -from ._wcwidth import wcwidth +from .wcwidth import wcwidth from .bisearch import bisearch -from ._wcswidth import wcswidth +from .wcswidth import wcswidth from ._constants import (_EMOJI_ZWJ_SET, _ISC_VIRAMA_SET, _CATEGORY_MC_TABLE, From d983907c6f6431e74ba71a0f232a2b46d30a8cc2 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 03:01:52 -0400 Subject: [PATCH 36/70] use py38-friendly types --- bin/update-tables.py | 6 +++--- tests/test_benchmarks.py | 4 ++-- wcwidth/clip.py | 4 ++-- wcwidth/grapheme.py | 8 ++++---- wcwidth/textwrap.py | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/update-tables.py b/bin/update-tables.py index 959af50..2e53a2e 100644 --- a/bin/update-tables.py +++ b/bin/update-tables.py @@ -25,7 +25,7 @@ from pathlib import Path from dataclasses import field, fields, dataclass -from typing import Any, Mapping, Iterable, Iterator, Sequence, Collection +from typing import Any, Mapping, Iterable, Iterator, Optional, Sequence, Collection try: from typing import Self @@ -112,7 +112,7 @@ class UnicodeVersion: major: int minor: int - micro: int | None + micro: Optional[int] @classmethod def parse(cls, version_str: str) -> UnicodeVersion: @@ -141,7 +141,7 @@ def __str__(self) -> str: class TableEntry: """An entry of a unicode table.""" - code_range: tuple[int, int] | None + code_range: Optional[tuple[int, int]] properties: tuple[str, ...] 
comment: str diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 03c7e86..2483a2a 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -11,7 +11,7 @@ # local import wcwidth -_width_module = sys.modules['wcwidth._width'] +_width_module = sys.modules['wcwidth.width'] def test_wcwidth_ascii(benchmark): @@ -299,7 +299,7 @@ def test_width_osc66(benchmark): benchmark(wcwidth.width, text) -def test_clip_osc66(benchmark, label, text, start, end): +def test_clip_osc66(benchmark): """Benchmark clip() with OSC 66 sequences.""" text = '\x1b]66;w=2;XY\x07\x1b]66;s=3;ABC\x07' benchmark(wcwidth.clip, text, 3, 8) diff --git a/wcwidth/clip.py b/wcwidth/clip.py index 00f0660..c1a77a8 100644 --- a/wcwidth/clip.py +++ b/wcwidth/clip.py @@ -2,7 +2,7 @@ # std imports from itertools import islice -from typing import Union, Callable, NamedTuple +from typing import Union, Callable, Optional, NamedTuple # local from .width import width @@ -145,7 +145,7 @@ def _write_cells(s: str, w: int, write_col: int) -> None: if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr - def _append_seq(seq: str, at_col: int | None = None) -> None: + def _append_seq(seq: str, at_col: Optional[int] = None) -> None: nonlocal sgr_at_clip_start, seq_order c = col if at_col is None else at_col sequences.append((c, seq_order, seq)) diff --git a/wcwidth/grapheme.py b/wcwidth/grapheme.py index 7befc92..cdfde22 100644 --- a/wcwidth/grapheme.py +++ b/wcwidth/grapheme.py @@ -13,7 +13,7 @@ from enum import IntEnum from functools import lru_cache -from typing import TYPE_CHECKING, NamedTuple +from typing import TYPE_CHECKING, Optional, NamedTuple # local from .bisearch import bisearch as _bisearch @@ -130,7 +130,7 @@ class BreakResult(NamedTuple): @lru_cache(maxsize=1024) -def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None: +def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> Optional[BreakResult]: """ Check simple GCB-pair-based break 
rules (cacheable). @@ -248,7 +248,7 @@ def _should_break( def iter_graphemes( unistr: str, start: int = 0, - end: int | None = None, + end: Optional[int] = None, ) -> Iterator[str]: r""" Iterate over grapheme clusters in a Unicode string. @@ -390,7 +390,7 @@ def grapheme_boundary_before(unistr: str, pos: int) -> int: def iter_graphemes_reverse( unistr: str, start: int = 0, - end: int | None = None, + end: Optional[int] = None, ) -> Iterator[str]: r""" Iterate over grapheme clusters in reverse order (last to first). diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index e0f360d..9302b15 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -12,7 +12,7 @@ import secrets import textwrap -from typing import TYPE_CHECKING, NamedTuple +from typing import TYPE_CHECKING, Optional, NamedTuple # local from .width import width as wcwidth_width @@ -36,7 +36,7 @@ class _HyperlinkState(NamedTuple): _HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') -def _parse_hyperlink_open(seq: str) -> _HyperlinkState | None: +def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: """Parse OSC 8 open sequence, return state or None.""" if (m := _HYPERLINK_OPEN_RE.match(seq)): return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) @@ -241,9 +241,9 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines: list[str] = [] is_first_line = True - hyperlink_state: _HyperlinkState | None = None + hyperlink_state: Optional[_HyperlinkState] = None # Track the id we're using for the current hyperlink continuation - current_hyperlink_id: str | None = None + current_hyperlink_id: Optional[str] = None # Arrange in reverse order so items can be efficiently popped chunks = list(reversed(chunks)) @@ -395,7 +395,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m def _track_hyperlink_state( self, text: str, - state: _HyperlinkState | None) -> _HyperlinkState | None: + 
state: Optional[_HyperlinkState]) -> Optional[_HyperlinkState]: """ Track hyperlink state through text. @@ -545,7 +545,7 @@ def wrap(text: str, width: int = 70, *, break_long_words: bool = True, break_on_hyphens: bool = True, drop_whitespace: bool = True, - max_lines: int | None = None, + max_lines: Optional[int] = None, placeholder: str = ' [...]', propagate_sgr: bool = True) -> list[str]: r""" From 9c20f006c0c5a3b28c31da03ddce907ad5a1369d Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 03:02:29 -0400 Subject: [PATCH 37/70] py38 friendly types --- docs/unicode_version.rst | 15 +++++++++++++++ wcwidth/table_ambiguous.py | 3 +-- wcwidth/table_grapheme.py | 3 +-- wcwidth/table_mc.py | 3 +-- wcwidth/table_vs16.py | 3 +-- wcwidth/table_wide.py | 3 +-- wcwidth/table_zero.py | 3 +-- wcwidth/wcwidth.py | 3 ++- 8 files changed, 23 insertions(+), 13 deletions(-) diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst index 41a1e52..38ff78d 100644 --- a/docs/unicode_version.rst +++ b/docs/unicode_version.rst @@ -16,6 +16,21 @@ release files: ``emoji-variation-sequences-12.0.0.txt`` *Date: 2019-01-15, 12:10:05 GMT* +``emoji-variation-sequences-13.0.0.txt`` + *Date: 2020-01-21, 07:15:05 GMT* + +``emoji-variation-sequences-14.0.0.txt`` + *Date: 2021-06-08, 05:19:16 GMT* + +``emoji-variation-sequences-15.0.0.txt`` + *Date: 2022-05-13, 21:54:24 GMT* + +``emoji-variation-sequences-15.1.0.txt`` + *Date: 2023-02-01, 02:22:54 GMT* + +``emoji-variation-sequences-16.0.0.txt`` + *Date: 2024-05-01, 21:25:24 GMT* + ``emoji-variation-sequences-17.0.0.txt`` *Date: 2025-01-30, 21:48:29 GMT* diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index 2c40498..8740b09 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,9 +1,8 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. 
+This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index d265b66..86ee8f7 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,9 +4,8 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code GRAPHEME_CR = ( diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 63acce9..5dd39d4 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,9 +1,8 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code CATEGORY_MC = { '17.0.0': ( diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index a5fc0a8..cd34923 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,9 +1,8 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 15:55:22 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { '9.0.0': ( diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 9d2ebd5..5fcbe92 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,9 +1,8 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. 
+This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code WIDE_EASTASIAN = { '17.0.0': ( diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index b669f70..dac9693 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,9 +1,8 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-04-29 16:45:27 UTC. +This code generated by wcwidth/bin/update-tables.py on 2026-04-30 05:21:17 UTC. """ - # pylint: disable=duplicate-code ZERO_WIDTH = { '17.0.0': ( diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 4d25cb3..0695ee1 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -64,7 +64,7 @@ # std imports from functools import lru_cache -from typing import Callable, Union, Literal, NamedTuple +from typing import Union, Literal, Callable, NamedTuple # local # pylint: disable=unused-import @@ -633,6 +633,7 @@ def _text_sizing_clip( if ts.params.width > 0: # Fixed-width mode: explicit count at `scale` cells each. # Use itertools.islice to avoid materializing the full grapheme list. + # std imports from itertools import islice for j, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): units.append((g, scale)) From addf07715d479ebfd6241ad5e055f66b1dd733ec Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 03:29:21 -0400 Subject: [PATCH 38/70] more refactor, remove text sizing for PR! 
--- docs/api.rst | 4 - docs/intro.rst | 2 - requirements-tests38.in | 1 - requirements-tests39.in | 1 - tests/test_benchmarks.py | 12 - tests/test_text_sizing.py | 279 ---------------------- tests/test_textwrap.py | 33 --- wcwidth/__init__.py | 3 +- wcwidth/clip.py | 135 +---------- wcwidth/escape_sequences.py | 20 +- wcwidth/text_sizing.py | 196 ---------------- wcwidth/wcwidth.py | 454 +----------------------------------- wcwidth/width.py | 15 +- 13 files changed, 10 insertions(+), 1145 deletions(-) delete mode 100644 tests/test_text_sizing.py delete mode 100644 wcwidth/text_sizing.py diff --git a/docs/api.rst b/docs/api.rst index a80eb40..55d288b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,8 +36,4 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions -.. autofunction:: wcwidth.TextSizing - -.. autofunction:: wcwidth.TextSizingParams - .. _SEMVER: https://semver.org diff --git a/docs/intro.rst b/docs/intro.rst index 3736f42..4c05214 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -470,8 +470,6 @@ History 0.7.0 *2026-04-30* * **Improved** `clip()` to support backward cursor sequence overwrite, "Painter's algorithm". - * **Improved** `width()` and `clip()` to support parsing of `kitty text - sizing protocol`_ (OSC 66). 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, diff --git a/requirements-tests38.in b/requirements-tests38.in index 5158715..ea2794e 100644 --- a/requirements-tests38.in +++ b/requirements-tests38.in @@ -3,4 +3,3 @@ pytest<7 pytest-cov coverage[toml]<6 packaging<26 -# pytest-benchmark<5 diff --git a/requirements-tests39.in b/requirements-tests39.in index 6714226..2a14182 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -8,4 +8,3 @@ tomli<2.3.0 cffi<2 pygments<2.20 zipp<3.23.1 -# pytest-benchmark diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 2483a2a..813e934 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -293,18 +293,6 @@ def test_clip_complex_sgr(benchmark): benchmark(wcwidth.clip, text, 6, 11) -def test_width_osc66(benchmark): - """Benchmark width() with OSC 66 sequences.""" - text = '\x1b]66;w=2;XY\x07\x1b]66;s=3;ABC\x07' - benchmark(wcwidth.width, text) - - -def test_clip_osc66(benchmark): - """Benchmark clip() with OSC 66 sequences.""" - text = '\x1b]66;w=2;XY\x07\x1b]66;s=3;ABC\x07' - benchmark(wcwidth.clip, text, 3, 8) - - def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] diff --git a/tests/test_text_sizing.py b/tests/test_text_sizing.py deleted file mode 100644 index ff634ea..0000000 --- a/tests/test_text_sizing.py +++ /dev/null @@ -1,279 +0,0 @@ -"""Tests for Text Sizing Protocol (OSC 66) support.""" - -# 3rd party -import pytest - -# local -from wcwidth import (TextSizing, - TextSizingParams, - clip, - width, - wcswidth, - iter_sequences, - strip_sequences) -from wcwidth.text_sizing import TEXT_FIELD_MAPPING -from wcwidth.escape_sequences import TEXT_SIZING_PATTERN - -_W_HI = TEXT_FIELD_MAPPING['w'].high -_N_HI = TEXT_FIELD_MAPPING['n'].high -_D_HI = TEXT_FIELD_MAPPING['d'].high - -CONTROL_CODES_PARAMS_CASES = [ - 
('x=2', "", "Unknown text sizing field 'x' in "), - ('s=3:x=3', "s=3", "Unknown text sizing field 'x' in "), - ('s=2:x=3:w=9', f"s=2:w={_W_HI}", "Unknown text sizing field 'x' in "), - ('xyz=2', "", "Unknown text sizing field 'xyz' in "), - ('xxx', "", "Expected '=' in text sizing parameter"), - ('s=xxx', "", "Illegal text sizing value 'xxx' in "), - ('s=-99', "", "Out of bounds text sizing value '-99' in "), - ('s=99', f"s={_W_HI}", "Out of bounds text sizing value '99' in "), - ('w=-1', "", "Out of bounds text sizing value '-1' in "), - ('w=8', f"w={_W_HI}", "Out of bounds text sizing value '8' in "), - ('n=20', f"n={_N_HI}", "Out of bounds text sizing value '20' in "), - ('d=99', f"d={_D_HI}", "Out of bounds text sizing value '99' in "), - ('v=5', "v=2", "Out of bounds text sizing value '5' in "), - ('h=3', "h=2", "Out of bounds text sizing value '3' in "), -] - - -@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) -def test_text_sizing_params_control_codes(given_params, expected_remainder, expected_exc): - """Verify control_codes='strict' and 'parse' behavior in TextSizingParams.from_params().""" - # assert control_codes='strict' raises expected exception, - with pytest.raises(ValueError) as exc_info: - TextSizingParams.from_params(given_params, control_codes='strict') - assert exc_info.value.args[0].startswith(expected_exc) - - # when 'parse' (default), any illegal argument or value is filtered, excluded, or clipped - params = TextSizingParams.from_params(given_params) - assert params.make_sequence() == expected_remainder - - -@pytest.mark.parametrize('given_params,expected_remainder,expected_exc,', CONTROL_CODES_PARAMS_CASES) -def test_text_sizing_width_control_codes(given_params, expected_remainder, expected_exc): - """Verify control_codes='strict' with invalid OSC 66 sequences in wciwdth.width().""" - seq1 = '\x1b]66;' + given_params + ';ABC' + '\x07' - seq2 = '\x1b]66;' + given_params + ';ABC' + '\x1b\\' - 
for seq in (seq1, seq2): - with pytest.raises(ValueError) as exc_info: - width(seq, control_codes='strict') - assert exc_info.value.args[0].startswith(expected_exc) - - -@pytest.mark.parametrize('params,text,expected_width', [ - # cases of static width=N values, - (TextSizingParams(scale=2, width=1), 'climclam', 2), - (TextSizingParams(scale=2, width=3), 'anything', 6), - (TextSizingParams(scale=1, width=5), '', 5), - (TextSizingParams(scale=3, width=1), 'x', 3), - # and automatic width (width=0) values, - (TextSizingParams(), '', 0), - (TextSizingParams(), 'AB', 2), - (TextSizingParams(), '中', 2), - (TextSizingParams(scale=2), 'AB', 4), - (TextSizingParams(scale=2), '中', 4), - (TextSizingParams(scale=3), '', 0), - (TextSizingParams(scale=7, width=7, numerator=15, denominator=15, - vertical_align=2, horizontal_align=2), 'x!yzzy', 49), -]) -def test_text_sizing_width(params, text, expected_width): - """Verify width using with both kinds of terminator.""" - # verify internal TextSizing.display_width() result, - assert TextSizing(params, text, terminator='\x07').display_width() == expected_width - assert TextSizing(params, text, terminator='\x1b\\').display_width() == expected_width - seq1 = TextSizing(params, text, terminator='\x07').make_sequence() - seq2 = TextSizing(params, text, terminator='\x1b\\').make_sequence() - - # verify round-trip - ts_match1, ts_match2 = TEXT_SIZING_PATTERN.match(seq1), TEXT_SIZING_PATTERN.match(seq2) - assert ts_match1 and ts_match2 - assert TextSizing.from_match(ts_match1) == TextSizing(params, text, terminator='\x07') - assert TextSizing.from_match(ts_match2) == TextSizing(params, text, terminator='\x1b\\') - - # and external width(), - assert width(seq1) == expected_width - assert width(seq2) == expected_width - - # verify 'strict' does not raise ValueError - width(seq1, control_codes='strict') - width(seq2, control_codes='strict') - - # and verify 'ignore' measures only inner_text (does not parse scale or width) - assert width(seq1, 
control_codes='ignore') == wcswidth(text) - assert width(seq2, control_codes='ignore') == wcswidth(text) - - -@pytest.mark.parametrize('given_sequence,expected_text,expected_params,expected_width', [ - ('\x1b]66;s=2:w=2;AB\x07', 'AB', 's=2:w=2', 4), - ('\x1b]66;s=2:w=2;\u4e2d\x07', '\u4e2d', 's=2:w=2', 4), - ('\x1b]66;s=3:w=1;x\x07', 'x', 's=3:w=1', 3), - ('\x1b]66;w=5;hello\x07', 'hello', 'w=5', 5), - ('\x1b]66;s=2:w=3;anything\x07', 'anything', 's=2:w=3', 6), - ('\x1b]66;w=3;x\x07', 'x', 'w=3', 3), - ('\x1b]66;s=1;AB\x07', 'AB', '', 2), - ('\x1b]66;s=2;AB\x07', 'AB', 's=2', 4), - ('\x1b]66;s=2;中\x07', '中', 's=2', 4), - ('\x1b]66;s=2;\x07', '', 's=2', 0), - ('\x1b]66;s=1:w=1;\x07', '', 'w=1', 1), - ('\x1b]66;w=2;A\x07', 'A', 'w=2', 2), - ('\x1b]66;s=2:w=3;text\x1b\\', 'text', 's=2:w=3', 6), -]) -def test_text_sizing_sequence(given_sequence, expected_text, expected_params, expected_width): - """Verify parsing and measured width of raw OSC 66 sequence.""" - ts_match = TEXT_SIZING_PATTERN.match(given_sequence) - assert ts_match is not None - text_size = TextSizing.from_match(ts_match) - assert text_size.params.make_sequence() == expected_params - assert text_size.text == expected_text - assert width(given_sequence, control_codes='parse') == expected_width - assert width(given_sequence, control_codes='strict') == expected_width - assert width(given_sequence, control_codes='ignore') == wcswidth(expected_text) - - -@pytest.mark.parametrize('text,expected', [ - ('\x1b]66;s=2:w=3:n=1:d=2:v=1:h=2;x!yzzy\x1b\\', 6), - ('\x1b]66;s=2:w=3;anything\x07', 6), - ('\x1b]66;w=3;x\x07', 3), - ('\x1b]66;s=1:w=0;AB\x07', 2), - ('\x1b]66;s=2:w=0;AB\x07', 4), - ('\x1b]66;s=2:w=0;\u4e2d\x07', 4), # '中' - ('\x1b]66;s=1:w=0;\x07', 0), - ('abc\x1b]66;w=3;x\x07def', 9), - ('\x1b]66;w=2;A\x07\x1b]66;w=3;B\x07', 5), - ('\x1b]66;s=2:w=3;text\x1b\\', 6), - ('\x1b[31m\x1b]66;w=2;AB\x07\x1b[0m', 2), -]) -def test_strings_with_text_sizing(text, expected): - """Verify measured width strings 
containing OSC66.""" - assert width(text) == expected - assert width(text, control_codes='strict') == expected - - -@pytest.mark.parametrize('text,expected', [ - ('\x1b]66;s=2;hello\x07', 'hello'), - ('\x1b]66;s=2;hello\x1b\\', 'hello'), - ('\x1b]66;;text\x07', 'text'), - ('\x1b]66;s=3:w=2;\x07', ''), - ('abc\x1b]66;w=2;XY\x07def', 'abcXYdef'), - ('\x1b[31m\x1b]66;s=2;red\x07\x1b[0m', 'red'), - ('\x1b]66;w=1;A\x07\x1b]66;w=1;B\x07', 'AB'), -]) -def test_strip_strings_with_text_sizing(text, expected): - assert strip_sequences(text) == expected - - -@pytest.mark.parametrize('text,expected_segs', [ - ('abc\x1b]66;s=2;hello\x07def', [('abc', False), ('\x1b]66;s=2;hello\x07', True), ('def', False)]), - ('abc\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\def', [('abc', False), ('\x1b]66;s=2;n=1,d=2,w=3;hello\x1b\\', True), ('def', False)]), -]) -def test_iter_sequences_text_sizing(text, expected_segs): - assert list(iter_sequences(text)) == expected_segs - - -@pytest.mark.parametrize('text,start,end,expected', [ - ('\x1b]66;w=3;ABC\x07', 0, 3, '\x1b]66;w=3;ABC\x07'), - ('\x1b]66;w=3;ABC\x07', 0, 2, '\x1b]66;w=2;AB\x07'), - ('\x1b]66;w=3;ABC\x07', 1, 3, '\x1b]66;w=2;BC\x07'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 6, 'ab\x1b]66;w=2;XY\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 0, 3, 'ab\x1b]66;w=1;X\x07'), - ('ab\x1b]66;w=2;XY\x07cd', 3, 6, '\x1b]66;w=1;Y\x07cd'), - ('ab\x1b]66;w=2;XY\x07cd', 4, 6, 'cd'), -]) -def test_clip_text_sizing_basic(text, start, end, expected): - """Test basic support of clip() with text sizing sequence.""" - assert repr(clip(text, start, end)) == repr(expected) - - -@pytest.mark.parametrize('text,start,end,expected', [ - ('\x1b]66;s=2;ABC\x07', 0, 0, ''), - ('\x1b]66;s=2;ABC\x07', 6, 6, ''), - ('\x1b]66;s=2;ABC\x07', 0, 2, '\x1b]66;s=2;A\x07'), - ('\x1b]66;s=2;ABC\x07', 0, 4, '\x1b]66;s=2;AB\x07'), - ('\x1b]66;s=2;ABC\x07', 0, 6, '\x1b]66;s=2;ABC\x07'), - ('\x1b]66;s=2;ABC\x07', 2, 6, '\x1b]66;s=2;BC\x07'), - ('\x1b]66;s=2;ABC\x07', 4, 6, '\x1b]66;s=2;C\x07'), -]) -def 
test_clip_text_sizing_scaled(text, start, end, expected): - """Test support of clip() with scale=N arguments.""" - assert repr(clip(text, start, end)) == repr(expected) - - -@pytest.mark.parametrize('text,start,end,expected', [ - # a b c - # === === === - # 012 345 678 - # . - # .. - # *a* - # *a* . - # ... *b* - # ... *b* . - # ... *b* .. - # ... *b* *c* - ('\x1b]66;s=3;ABC\x07', 0, 0, ''), - ('\x1b]66;s=3;ABC\x07', 0, 1, '.'), - ('\x1b]66;s=3;ABC\x07', 0, 2, '..'), - ('\x1b]66;s=3;ABC\x07', 0, 3, '\x1b]66;s=3;A\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 4, '\x1b]66;s=3;A\x07.'), - ('\x1b]66;s=3;ABC\x07', 0, 5, '\x1b]66;s=3;A\x07..'), - ('\x1b]66;s=3;ABC\x07', 0, 6, '\x1b]66;s=3;AB\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 7, '\x1b]66;s=3;AB\x07.'), - ('\x1b]66;s=3;ABC\x07', 0, 8, '\x1b]66;s=3;AB\x07..'), - ('\x1b]66;s=3;ABC\x07', 0, 9, '\x1b]66;s=3;ABC\x07'), - ('\x1b]66;s=3;ABC\x07', 0, 10, '\x1b]66;s=3;ABC\x07'), - # a b - # === === === - # 012 345 678 - # . 1, 2 - # .. 1, 3 - # .. . 1, 4 - # .. .. 1, 5 - # .. *b* 1, 6 - # .. *b* . 1, 7 - # .. *b* .. 1, 8 - # .. *b* *c* 1, 9 - ('\x1b]66;s=3;ABC\x07', 1, 1, ''), - ('\x1b]66;s=3;ABC\x07', 1, 2, '.'), - ('\x1b]66;s=3;ABC\x07', 1, 3, '..'), - ('\x1b]66;s=3;ABC\x07', 1, 4, '...'), - ('\x1b]66;s=3;ABC\x07', 1, 5, '....'), - ('\x1b]66;s=3;ABC\x07', 1, 6, '..\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 1, 7, '..\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 1, 8, '..\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 1, 9, '..\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 1, 10, '..\x1b]66;s=3;BC\x07'), - # two-thirds of string 'A' and half of string 'B' is fillchar - # ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), - # half of string 'A' and all of string 'B' - # a b - # === === === - # 012 345 678 - # . 2, 3 - # . . 2, 4 - # . .. 2, 5 - # . *b* 2, 6 - # . *b* . 2, 7 - # . *b* .. 2, 8 - # . 
*b* *c* 2, 9 - ('\x1b]66;s=3;ABC\x07', 2, 2, ''), - ('\x1b]66;s=3;ABC\x07', 2, 3, '.'), - ('\x1b]66;s=3;ABC\x07', 2, 4, '..'), - ('\x1b]66;s=3;ABC\x07', 2, 5, '...'), - ('\x1b]66;s=3;ABC\x07', 2, 6, '.\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 2, 7, '.\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 2, 8, '.\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 2, 9, '.\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 2, 10, '.\x1b]66;s=3;BC\x07'), - # and now 3:10, should be easy ... - ('\x1b]66;s=3;ABC\x07', 3, 3, ''), - ('\x1b]66;s=3;ABC\x07', 3, 4, '.'), - ('\x1b]66;s=3;ABC\x07', 3, 5, '..'), - ('\x1b]66;s=3;ABC\x07', 3, 6, '\x1b]66;s=3;B\x07'), - ('\x1b]66;s=3;ABC\x07', 3, 7, '\x1b]66;s=3;B\x07.'), - ('\x1b]66;s=3;ABC\x07', 3, 8, '\x1b]66;s=3;B\x07..'), - ('\x1b]66;s=3;ABC\x07', 3, 9, '\x1b]66;s=3;BC\x07'), - ('\x1b]66;s=3;ABC\x07', 3, 10, '\x1b]66;s=3;BC\x07'), -]) -def test_clip_text_sizing_scaled_with_fillchar(text, start, end, expected): - """Test support of clip() with scale=N and fillchar is needed to fill remainder.""" - assert repr(clip(text, start, end, fillchar='.')) == repr(expected) diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index b502c73..ae2eb2d 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -485,36 +485,3 @@ def test_wrap_replace_whitespace_false_newlines_zero_width(): """Newlines have zero display width, so more text fits per line than stdlib.""" assert wrap('hello\nworld foo\nbar', 10, replace_whitespace=False) == [ 'hello\nworld', 'foo\nbar'] - - -# kitty text sizing protocol (OSC 66) constants for wrap() tests. -# Width calculation, BEL/ST/scale/auto-width/CJK/SGR interaction with OSC66 are -# already covered exhaustively in test_text_sizing.py and test_clip_*.py. These -# tests verify only the *line-breaking* behaviour that is unique to wrap(). 
-TS3 = '\x1b]66;w=3;XYZ\x07' # explicit width=3 - - -@pytest.mark.parametrize('text,w,expected', [ - # Greedy fill: atomic sequence moves to next line when line width exceeded - ('abc' + TS3 + 'def', 4, ['abc' + TS3 + 'd', 'ef']), - ('abc' + TS3 + 'def', 5, ['abc' + TS3 + 'de', 'f']), - ('abc' + TS3 + 'def', 6, ['abc', TS3 + 'def']), - ('abc' + TS3 + 'def', 8, ['abc', TS3 + 'def']), - ('abc' + TS3 + 'def', 10, ['abc' + TS3 + 'def']), - # Sequence stays with preceding word when total stripped width fits - ('aa' + TS3 + 'bb', 5, ['aa', TS3 + 'bb']), - ('pre' + TS3 + 'post', 8, ['pre', TS3 + 'post']), -]) -def test_wrap_ts_line_fill(text, w, expected): - """OSC 66 sequence width is respected and treated as atomic unit when filling lines.""" - assert wrap(text, w) == expected - - -@pytest.mark.parametrize('text,w,expected', [ - # max_lines truncation preserves OSC66 sequence atomically with truncated text - ('abc' + TS3 + 'def', 7, ['abc', TS3 + 'def']), - ('ab' + TS3 + 'cd', 6, ['ab', TS3 + 'cd']), -]) -def test_wrap_ts_max_lines(text, w, expected): - """max_lines truncation works correctly with OSC 66 sequences.""" - assert wrap(text, w, max_lines=2, placeholder='~') == expected diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 175c1f7..5020d2c 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -21,7 +21,6 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH -from .text_sizing import TextSizing, TextSizingParams from .table_ambiguous import AMBIGUOUS_EASTASIAN from .escape_sequences import iter_sequences, strip_sequences from .unicode_versions import list_versions @@ -31,7 +30,7 @@ __all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr', 'TextSizing', 'TextSizingParams') + 'list_versions', 'propagate_sgr') 
# Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/clip.py b/wcwidth/clip.py index c1a77a8..97c735e 100644 --- a/wcwidth/clip.py +++ b/wcwidth/clip.py @@ -1,8 +1,7 @@ """This is a python implementation of clip().""" # std imports -from itertools import islice -from typing import Union, Callable, Optional, NamedTuple +from typing import Optional, NamedTuple # local from .width import width @@ -11,7 +10,6 @@ _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) -from .text_sizing import TextSizing, TextSizingParams from .escape_sequences import _SEQUENCE_CLASSIFY @@ -29,9 +27,6 @@ class SeqToken(NamedTuple): text: str -Token = Union[VisToken, SeqToken] - - def clip( text: str, start: int, @@ -85,9 +80,6 @@ def clip( .. versionchanged:: 0.5.0 Added ``propagate_sgr`` parameter (default True). - .. versionchanged:: 0.6.1 - Parses OSC 66 Sequences. - Example:: >>> clip('hello world', 0, 5) @@ -192,25 +184,7 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx = m.end() continue - # 1c. OSC 66 Text Sizing - if (ts_meta := m.group('ts_meta')) is not None: - ts_text = m.group('ts_text') - ts_term = m.group('ts_term') - col = _text_sizing_clip( - TextSizing( - TextSizingParams.from_params(ts_meta), - ts_text, - ts_term), - col=col, start=start, end=end, - write_cells=_write_cells, - fillchar=fillchar, ambiguous_width=ambiguous_width, - ) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - idx = m.end() - continue - - # 1d. Any other recognized zero-width sequence + # 1c. 
Any other recognized zero-width sequence _append_seq(m.group()) idx = m.end() continue @@ -310,108 +284,3 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: result += '\x1b[0m' return result - - -def _text_sizing_clip( - ts: TextSizing, - *, - col: int, - start: int, - end: int, - write_cells: Callable[[str, int, int], None], - fillchar: str = ' ', - ambiguous_width: int = 1, -) -> int: - """ - Emit tokens for a text-sizing (OSC 66) sequence, clipped to ``[start, end)``. - - Returns ``new_col`` (column position after the sequence). - """ - # pylint: disable=too-many-locals,too-many-branches,too-complex - ts_width = ts.display_width(ambiguous_width) - - # Sequence fully visible or fully outside: simple cases - if col >= start and col + ts_width <= end: - write_cells(ts.make_sequence(), ts_width, col) - return col + ts_width - if col >= end or col + ts_width <= start: - return col + ts_width - - # Partial overlap: the sequence straddles a clip boundary. - # Decompose into unit cells (each grapheme occupies `scale` cells), - # emit as many whole units as fit inside [start, end), filling the - # remainder with `fillchar`. - rel_start = max(0, start - col) - rel_end = min(end, col + ts_width) - col - scale = ts.params.scale - - # Build the list of (grapheme, cell_width) units - units: list[tuple[str, int]] = [] - if ts.params.width > 0: - # Fixed-width mode: explicit count at `scale` cells each. - # Use itertools.islice to avoid materializing the full grapheme list. 
- # std imports - for _, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): - units.append((g, scale)) - # Pad with empty graphemes if text had fewer than width - for _ in range(ts.params.width - len(units)): - units.append(('', scale)) - else: - # Auto-width mode: grapheme count derived from content, width varies - for g in iter_graphemes(ts.text): - units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) - - # Batch of consecutive fully-visible units that can be emitted as a - # single text-sizing sequence. - pending_units: list[tuple[str, int]] = [] # (grapheme_text, cell_width) - - def flush(flush_col: int) -> None: - """Emit accumulated graphemes as one text-sizing sequence.""" - if not pending_units: - return - texts = [u[0] for u in pending_units] - total_w = sum(u[1] for u in pending_units) - params = TextSizingParams( - scale, - len(texts) if ts.params.width > 0 else 0, - ts.params.numerator, - ts.params.denominator, - ts.params.vertical_align, - ts.params.horizontal_align) - write_cells( - TextSizing(params, ''.join(texts), ts.terminator).make_sequence(), - total_w, - flush_col) - pending_units.clear() - - # Walk units in cell-coordinate space, collecting consecutive fully-visible - # ones into a batch (flushed as one sequence) and emitting fillchars for - # partial units at the boundaries. 
- flush_col_pos = col + rel_start - unit_pos = 0 # current position in cell-coordinates within the sequence - for unit_text, unit_w in units: - unit_end = unit_pos + unit_w - if unit_end <= rel_start: - # Unit is entirely before the clip window - unit_pos = unit_end - continue - if unit_pos >= rel_end: - # Unit is entirely past the clip window - break - - overlap = min(unit_end, rel_end) - max(unit_pos, rel_start) - if overlap == unit_w and unit_w > 0: - # Unit fits completely — batch it with others - if not pending_units: - flush_col_pos = col + max(unit_pos, rel_start) - pending_units.append((unit_text, unit_w)) - else: - # Unit is partially clipped — flush batch, emit fillchars for remainder - flush(flush_col_pos) - abs_start = col + max(unit_pos, rel_start) - for i in range(overlap): - write_cells(fillchar, 1, abs_start + i) - unit_pos = unit_end - - flush(flush_col_pos) - return col + ts_width diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index acd7e8e..806fdc1 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -14,20 +14,12 @@ # local from .sgr_state import _SGR_PATTERN -# Text Sizing Protocol (OSC 66) — has positive width, must be checked before ZERO_WIDTH_PATTERN. -# Groups: (1) metadata, (2) inner text, (3) terminator (BEL or ST). -# https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ -TEXT_SIZING_PATTERN = re.compile( - r'\x1b\]66;([^;\x07\x1b]*);([^\x07\x1b]*)(\x07|\x1b\\)' -) - # Zero-width escape sequences (SGR, OSC, CSI, etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( # CSI sequences r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]|' - # OSC sequences, note that text sizing protocol (OSC 66) is special case in width() and clip(), - # and contrary to the variable name, it is positive width. 
+ # OSC sequences r'\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)|' # APC sequences r'\x1b_[^\x1b\x07]*(?:\x07|\x1b\\)|' @@ -56,8 +48,6 @@ _SGR_PATTERN.pattern.replace('(', '(?P', 1) + '|' + CURSOR_RIGHT_SEQUENCE.pattern.replace('(', '(?P', 1) + '|' + CURSOR_LEFT_SEQUENCE.pattern.replace('(', '(?P', 1) - + '|' + - r'\x1b\]66;(?P[^;\x07\x1b]*);(?P[^\x07\x1b]*)(?P\x07|\x1b\\)' + '|' + r'(?P(?:' + ZERO_WIDTH_PATTERN.pattern + '))' ) @@ -151,7 +141,7 @@ def strip_sequences(text: str) -> str: r""" Return text with all terminal escape sequences removed. - For sequences containing printable text, OSC 66 (Text sizing protocol) and OSC 8 (hyperlink), + For sequences containing printable text, such as OSC 8 (hyperlink), the inner text is preserved. Unknown or incomplete ESC sequences are preserved. @@ -169,11 +159,7 @@ def strip_sequences(text: str) -> str: 'hello' >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') 'bold red text' - >>> strip_sequences('\x1b]66;s=2;hello\x07') - 'hello' >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') '[view]' """ - if '\x1b]66;' in text: - text = TEXT_SIZING_PATTERN.sub(r'\2', text) - return ZERO_WIDTH_PATTERN.sub('', text) + return ZERO_WIDTH_PATTERN.sub('', text) \ No newline at end of file diff --git a/wcwidth/text_sizing.py b/wcwidth/text_sizing.py deleted file mode 100644 index 5a930a3..0000000 --- a/wcwidth/text_sizing.py +++ /dev/null @@ -1,196 +0,0 @@ -r""" -`kitty text sizing protocol`_ (OSC 66) parsing and measurement. - -The kitty text sizing protocol allows terminal apps to explicitly tell -terminals how many cells text occupies, using the escape sequence:: - - ESC ] 66 ; metadata ; text BEL/ST - -Metadata is colon-separated ``key=value`` pairs: - -- ``s``: scale -- ``w``: width in cells -- ``n``: fractional numerator -- ``d``: fractional denominator -- ``v``: vertical alignment -- ``h``: horizontal alignment - -Parsing is pretty straight-forward: - -- When ``w > 0``, return ``s * w``. 
-- Otherwise ``w == 0``, ``s * wcswidth(inner_text_width)`` cells. - -Numerator, denominator, and alignment codes and values are parsed but otherwise ignored -and have no effect on measurements made in this library. - -.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ - -.. versionadded:: 0.7.0 -""" - -from __future__ import annotations - -# std imports -import re - -import typing - -# local -from .wcswidth import wcswidth - - -class _FieldMeta(typing.NamedTuple): - name: str - low: int - high: int - default: int - - -TEXT_FIELD_MAPPING: dict[str, _FieldMeta] = { - 's': _FieldMeta(name='scale', low=1, high=7, default=1), - 'w': _FieldMeta(name='width', low=0, high=7, default=0), - 'n': _FieldMeta(name='numerator', low=0, high=15, default=0), - 'd': _FieldMeta(name='denominator', low=0, high=15, default=0), - 'v': _FieldMeta(name='vertical_align', low=0, high=2, default=0), - 'h': _FieldMeta(name='horizontal_align', low=0, high=2, default=0)} - - -class TextSizingParams(typing.NamedTuple): - """ - Parsed parameters from a text sizing escape sequence (OSC 66). - - :param scale: Scale factor (1-7). Text occupies ``scale`` rows tall and ``scale * width`` - columns wide. - :param width: Width in cells (0-7). When 0, width is auto-calculated from the inner text. - :param numerator: Fractional scaling numerator (0-15). - :param denominator: Fractional scaling denominator (0-15). - :param vertical_align: Vertical alignment (0=top, 1=bottom, 2=center). - :param horizontal_align: Horizontal alignment (0=left, 1=right, 2=center). - """ - - scale: int = 1 - width: int = 0 - numerator: int = 0 - denominator: int = 0 - vertical_align: int = 0 - horizontal_align: int = 0 - - def __repr__(self) -> str: - """ - Return a compact representation including only non-default fields. - - This avoids verbose output when most fields are defaults. 
- """ - # modified to show values only when non-default - repr_fmt = ', '.join(f'{field.name}={getattr(self, field.name)}' - for field in TEXT_FIELD_MAPPING.values() - if getattr(self, field.name) != field.default) - return f'{self.__class__.__name__}({repr_fmt})' - - def make_sequence(self) -> str: - """Build and return sub-part of an OSC 66 sequence.""" - parts = [] - # build string for all known parameters of non-default values - for field_key, field in TEXT_FIELD_MAPPING.items(): - if (val := getattr(self, field.name)) != field.default: - parts.append(f'{field_key}={val}') - return ':'.join(parts) - - @classmethod - def from_params(cls, raw: str, control_codes: str = 'parse') -> TextSizingParams: - """ - Parse colon-separated ``key=value`` metadata string. - - :param raw: Metadata string, e.g. ``'s=2:w=3'``. - :param control_codes: 'parse' or 'strict'. - :raises ValueError: If ``control_codes='strict'`` unrecognized text sizing parameters raise - ValueError. - :returns: Parsed parameters with values clamped to valid ranges. - Unknown keys are ignored. Non-integer values use defaults. 
- - Example:: - - >>> TextSizingParams.from_params('s=2:w=3') - TextSizingParams(scale=2, width=3, numerator=0, denominator=0, \ - vertical_align=0, horizontal_align=0) - """ - kwargs: typing.Dict[str, int] = {} - if not raw: - return cls() - for part in raw.split(':'): - if '=' not in part: - if control_codes == 'strict': - raise ValueError(f"Expected '=' in text sizing parameter (key=val), " - f"got {part!r} in OSC 66 sequence, {raw!r}") - continue - key, _eq, val = part.partition('=') - field = TEXT_FIELD_MAPPING.get(key) - if field is None: - if control_codes == 'strict': - raise ValueError(f"Unknown text sizing field '{key}' " - f"in OSC 66 sequence, {raw!r}") - # ignore unknown fields unless 'strict' - continue - try: - value = int(val) - except ValueError as exc: - if control_codes == 'strict': - raise ValueError(f"Illegal text sizing value '{val}' " - f"in OSC 66 sequence, {raw!r}: {exc}") from exc - # ignore value, uses default value without warning unless 'strict' - continue - if control_codes == 'strict' and (value > field.high or value < field.low): - raise ValueError(f"Out of bounds text sizing value '{val}' " - f"in OSC 66 sequence, {raw!r}: " - f"allowed range for '{key}' ({field.name}) " - f"is {field.low} to {field.high}") - kwargs[field.name] = max(field.low, min(field.high, value)) - return cls(**kwargs) - - -class TextSizing(typing.NamedTuple): - """Basic horizontal width measurement for kitty text sizing protocol.""" - - params: TextSizingParams - text: str - terminator: str - - @classmethod - def from_match(cls, match: re.Match[str], control_codes: str = 'parse') -> TextSizing: - r""" - Parse using matching OSC 66 Sequence. - - :param match: match object from :attr:`wcwidth.escape_sequences.TEXT_SIZING_PATTERN`. - :param control_codes: 'parse' or 'strict', same meaning as delegated by - :func:`wcwidth.width`. - :raises ValueError: When ``control_codes='strict'`` for unrecognized, invalid, or out of - bounds text sizing parameters. 
- :returns: TextSizing object from parsed sequence - - Example:: - - from wcwidth.escape_sequences import TEXT_SIZING_PATTERN - >>> TextSizing.from_match(TEXT_SIZING_PATTERN.match('\x1b]66;w=2;XY\x07')) - TextSizing(params=TextSizingParams(scale=1, width=2, numerator=0, denominator=0, \ - vertical_align=0, horizontal_align=0), text='XY', terminator='\x07') - """ - return cls(params=TextSizingParams.from_params(match.group(1), control_codes=control_codes), - text=match.group(2), - terminator=match.group(3)) - - def display_width(self, ambiguous_width: int = 1) -> int: - """ - Calculate the display width of a text sizing sequence. - - :param ambiguous_width: Width for East Asian Ambiguous characters. - :returns: Display width in terminal cells. When ``width > 0``, returns ``params.scale * - params.width``. When ``width == 0``, returns ``params.scale * measured_inner_width``. - """ - if self.params.width > 0: - return self.params.scale * self.params.width - w = wcswidth(self.text, ambiguous_width=ambiguous_width) - return self.params.scale * max(0, w) - - def make_sequence(self) -> str: - """Build and return complete OSC 66 Terminal Sequence.""" - return f'\x1b]66;{self.params.make_sequence()};{self.text}{self.terminator}' diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index dc72236..638d4a0 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -155,456 +155,4 @@ def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> if ambiguous_width == 2 and bisearch(ucs, _AMBIGUOUS_TABLE): return 2 -<<<<<<< HEAD - >>> wcwidth.rjust('hi', 5) - ' hi' - >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5) - ' \x1b[31mhi\x1b[0m' - >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - padding_cells = max(0, dest_width - text_width) - return fillchar * padding_cells 
+ text - - -def center( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text centered in a string of given display width. - - :param text: String to center, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on both sides to reach ``dest_width``. - - For odd-width padding, the extra cell goes on the right (matching - Python's :meth:`str.center` behavior). - - .. versionadded:: 0.3.0 - - Example:: - - >>> wcwidth.center('hi', 6) - ' hi ' - >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6) - ' \x1b[31mhi\x1b[0m ' - >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧 ' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - total_padding = max(0, dest_width - text_width) - # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html - left_pad = total_padding // 2 + (total_padding & dest_width & 1) - right_pad = total_padding - left_pad - return fillchar * left_pad + text + fillchar * right_pad - - -def clip( - text: str, - start: int, - end: int, - *, - fillchar: str = ' ', - tabsize: int = 8, - ambiguous_width: int = 1, - propagate_sgr: bool = True, -) -> str: - r""" - Clip text to display columns ``(start, end)`` while preserving all terminal sequences. 
- - This function extracts a substring based on visible column positions rather than - character indices. Terminal escape sequences are preserved in the output since - they have zero display width. If a wide character (width 2) would be split at - either boundary, it is replaced with ``fillchar``. - - TAB characters (``\t``) are expanded to spaces up to the next tab stop, - controlled by the ``tabsize`` parameter. - - Other cursor movement characters (backspace, carriage return) and cursor - movement sequences are passed through unchanged as zero-width. - - :param text: String to clip, may contain terminal escape sequences. - :param start: Absolute starting column (inclusive, 0-indexed). - :param end: Absolute ending column (exclusive). - :param fillchar: Character to use when a wide character must be split at - a boundary (default space). Must have display width of 1. - :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through - as zero-width (preserved in output but don't advance column position). - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :param propagate_sgr: If True (default), SGR (terminal styling) sequences - are propagated. The result begins with any active style at the start - position and ends with a reset sequence if styles are active. - :returns: Substring of ``text`` spanning display columns ``(start, end)``, - with all terminal sequences preserved and wide characters at boundaries - replaced with ``fillchar``. - - SGR (terminal styling) sequences are propagated by default. The result - begins with any active style and ends with a reset:: - - >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) - '\x1b[1;34mworld\x1b[0m' - - Set ``propagate_sgr=False`` to disable this behavior. - - .. versionadded:: 0.3.0 - - .. versionchanged:: 0.5.0 - Added ``propagate_sgr`` parameter (default True). - - .. versionchanged:: 0.6.1 - Parses OSC 66 Sequences. 
- - Example:: - - >>> clip('hello world', 0, 5) - 'hello' - >>> clip('中文字', 0, 3) # Wide char split at column 3 - '中 ' - >>> clip('a\tb', 0, 10) # Tab expanded to spaces - 'a b' - """ - # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks,W0101 - # Again, for 'hot path', we avoid additional delegate functions and accept the cost - # of complexity for improved python performance. - start = max(start, 0) - if end <= start: - return '' - - # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) - if text.isascii() and text.isprintable(): - return text[start:end] - - # Fast path: no escape sequences means no SGR tracking needed - if propagate_sgr and '\x1b' not in text: - propagate_sgr = False - - # SGR tracking state (only when propagate_sgr=True) - # sgr_at_clip_start is sgr state when first visible char emitted (None = not yet) - sgr_at_clip_start = None - # current active sgr state - sgr = None # current SGR state, updated by matches of _SGR_PATTERN - if propagate_sgr: - sgr = _SGR_STATE_DEFAULT - - # Painter's algorithm data structures: - # 1. cells: maps column integer to a visible character (with its width) - # cells that are part of a wide character's right half are not populated. - # 2. sequences: maps column integer to a list of zero-width sequences emitted at that position - # and their chronological order number. 
- cells: dict[int, tuple[str, int]] = {} - sequences: list[tuple[int, int, str]] = [] # (col, seq_order, text) - seq_order = 0 # relative ordering of sequences - - col = 0 - idx = 0 - - def _write_cells(s: str, w: int, write_col: int) -> None: - nonlocal sgr_at_clip_start - if w > 0: - # Fix up wide-char orphans and clear overwritten cells in one pass - for offset in range(w): - src_col = write_col + offset - if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) - if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) - cells.pop(src_col, None) - cells[write_col] = (s, w) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - def _append_seq(seq: str, at_col: int | None = None) -> None: - nonlocal sgr_at_clip_start, seq_order - c = col if at_col is None else at_col - sequences.append((c, seq_order, seq)) - seq_order += 1 - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': - break - - # 1. Handle escape sequences and bare ESC - if char == '\x1b': - if (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() - if (propagate_sgr and sgr) and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - idx = match.end() - continue - - # Cursor-forward sequences (e.g. CSI n C) advance the column; - # simulate by emitting fillchars for the visible portion. 
- if (match_cforward := CURSOR_RIGHT_SEQUENCE.match(seq)): - digit_txt = match_cforward.group(1) - n_forward = int(digit_txt) if digit_txt else 1 - move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) - col = move_end - idx = match.end() - continue - - # Cursor-backward sequences (e.g. CSI n D) retreat the column. - if (match_cbackward := CURSOR_LEFT_SEQUENCE.match(seq)): - digit_txt = match_cbackward.group(1) - n_backward = int(digit_txt) if digit_txt else 1 - col = max(0, col - n_backward) - idx = match.end() - continue - - if (ts_match := TEXT_SIZING_PATTERN.match(seq)): - # OSC 66 (text sizing) has positive width - col = _text_sizing_clip( - TextSizing.from_match(ts_match), - col=col, start=start, end=end, - write_cells=_write_cells, - fillchar=fillchar, ambiguous_width=ambiguous_width, - ) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - idx = match.end() - continue - - # Other zero-width sequences (OSC hyperlinks, etc.) are preserved as-is - _append_seq(seq) - idx = match.end() - continue - else: - # Bare ESC not matching any recognized sequence pattern - _append_seq(char) - idx += 1 - continue - - # 3. TAB expansion - if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - _write_cells(' ', 1, col) - col += 1 - else: - # preserve tab as-is - _append_seq(char) - idx += 1 - continue - - # 4. 
Grapheme clustering for everything else - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - - if grapheme_w == 0: - # combining/zero-width grapheme; preserve as token at this column - if start <= col < end: - _append_seq(grapheme) - elif col >= start and col + grapheme_w <= end: - # Fully visible - _write_cells(grapheme, grapheme_w, col) - elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) — emit fillchars - clip_start = max(start, col) - for i in range(min(end, col + grapheme_w) - clip_start): - _write_cells(fillchar, 1, clip_start + i) - # advance column whether visible or not - col += grapheme_w - idx += len(grapheme) - - # ── Reconstruct result from painter's algorithm grid ────────────────── - # Build column→sorted sequences index - seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for col_pos, order, seq_text in sequences: - seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) - for entries in seqs_by_col.values(): - entries.sort() - - max_cell_col = max(cells.keys()) if cells else -1 - max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 - max_col = max(max_cell_col, max_seq_col) - - # Walk columns 0..min(max_col, end), emitting sequences then any cell - # or fillchar occupying each position. Visits *inclusive* of - # min(max_col, end) so sequences at `end` are preserved. 
- parts: list[str] = [] - walk_col = 0 - col_limit = min(max_col, end) - while walk_col <= col_limit: - # Zero-width sequences at this column - for _, seq_text in seqs_by_col.get(walk_col, ()): - parts.append(seq_text) - - if walk_col >= end: - walk_col += 1 - continue - - if walk_col in cells: - cell_text, cell_w = cells[walk_col] - cell_end = walk_col + cell_w - - if walk_col >= start and cell_end <= end: - # Fully inside clip window - parts.append(cell_text) - elif cell_end > start: - # Partial overlap (wide char split at boundary) - parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) - # else: cell entirely before start — skip - - walk_col += cell_w - else: - # Hole: emit fillchar for columns inside [start, end) that - # lie within the written cell area - if walk_col >= start and walk_col <= max_cell_col: - parts.append(fillchar) - walk_col += 1 - - # Trailing sequences past col_limit (SGR resets after short text, etc.) - for c in sorted(seqs_by_col.keys()): - if c > col_limit: - for _, seq_text in seqs_by_col[c]: - parts.append(seq_text) - - result = ''.join(parts) - - # Apply SGR prefix/suffix - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): - result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): - result += '\x1b[0m' - - return result - - -def _text_sizing_clip( - ts: TextSizing, - *, - col: int, - start: int, - end: int, - write_cells: Callable[[str, int, int], None], - fillchar: str = ' ', - ambiguous_width: int = 1, -) -> int: - """ - Emit tokens for a text-sizing (OSC 66) sequence, clipped to ``[start, end)``. - - Returns ``new_col`` (column position after the sequence). 
- """ - # pylint: disable=too-many-locals - ts_width = ts.display_width(ambiguous_width) - - # Sequence fully visible or fully outside: simple cases - if col >= start and col + ts_width <= end: - write_cells(ts.make_sequence(), ts_width, col) - return col + ts_width - if col >= end or col + ts_width <= start: - return col + ts_width - - # Partial overlap: the sequence straddles a clip boundary. - # Decompose into unit cells (each grapheme occupies `scale` cells), - # emit as many whole units as fit inside [start, end), filling the - # remainder with `fillchar`. - rel_start = max(0, start - col) - rel_end = min(end, col + ts_width) - col - scale = ts.params.scale - - # Build the list of (grapheme, cell_width) units - units: list[tuple[str, int]] = [] - if ts.params.width > 0: - # Fixed-width mode: explicit count at `scale` cells each. - # Use itertools.islice to avoid materializing the full grapheme list. - # std imports - from itertools import islice - for j, g in enumerate(islice(iter_graphemes(ts.text), ts.params.width)): - units.append((g, scale)) - # Pad with empty graphemes if text had fewer than width - for _ in range(ts.params.width - len(units)): - units.append(('', scale)) - else: - # Auto-width mode: grapheme count derived from content, width varies - for g in iter_graphemes(ts.text): - units.append((g, width(g, ambiguous_width=ambiguous_width) * scale)) - - # Batch of consecutive fully-visible units that can be emitted as a - # single text-sizing sequence. 
- pending_units: list[tuple[str, int]] = [] # (grapheme_text, cell_width) - - def flush(flush_col: int) -> None: - """Emit accumulated graphemes as one text-sizing sequence.""" - if not pending_units: - return - texts = [u[0] for u in pending_units] - total_w = sum(u[1] for u in pending_units) - params = TextSizingParams( - scale, - len(texts) if ts.params.width > 0 else 0, - ts.params.numerator, - ts.params.denominator, - ts.params.vertical_align, - ts.params.horizontal_align) - write_cells( - TextSizing(params, ''.join(texts), ts.terminator).make_sequence(), - total_w, - flush_col) - pending_units.clear() - - # Walk units in cell-coordinate space, collecting consecutive fully-visible - # ones into a batch (flushed as one sequence) and emitting fillchars for - # partial units at the boundaries. - flush_col_pos = col + rel_start - unit_pos = 0 # current position in cell-coordinates within the sequence - for unit_text, unit_w in units: - unit_end = unit_pos + unit_w - if unit_end <= rel_start: - # Unit is entirely before the clip window - unit_pos = unit_end - continue - if unit_pos >= rel_end: - # Unit is entirely past the clip window - break - - overlap = min(unit_end, rel_end) - max(unit_pos, rel_start) - if overlap == unit_w and unit_w > 0: - # Unit fits completely — batch it with others - if not pending_units: - flush_col_pos = col + max(unit_pos, rel_start) - pending_units.append((unit_text, unit_w)) - else: - # Unit is partially clipped — flush batch, emit fillchars for remainder - flush(flush_col_pos) - abs_start = col + max(unit_pos, rel_start) - for i in range(overlap): - write_cells(fillchar, 1, abs_start + i) - unit_pos = unit_end - - flush(flush_col_pos) - return col + ts_width -======= - return 1 ->>>>>>> jq/refactor + return 1 \ No newline at end of file diff --git a/wcwidth/width.py b/wcwidth/width.py index 82d8f78..e6220b6 100644 --- a/wcwidth/width.py +++ b/wcwidth/width.py @@ -12,7 +12,6 @@ _FITZPATRICK_RANGE, _REGIONAL_INDICATOR_SET) from 
.table_vs16 import VS16_NARROW_TO_WIDE -from .text_sizing import TextSizing, TextSizingParams from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT from .escape_sequences import _SEQUENCE_CLASSIFY, INDETERMINATE_EFFECT_SEQUENCE, strip_sequences @@ -61,8 +60,8 @@ def width( - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB ``\t``, cursor left and right movement sequences. Vertical movement (LF, VT, FF) and - indeterminate terminal sequences are zero-width. OSC 66 Kitty Text Sizing protocol, OSC 8 - Hyperlink, and many other kinds of output sequences are parsed for displayed measurements. + indeterminate terminal sequences are zero-width. OSC 8 Hyperlink, and many other kinds + of output sequences are parsed for displayed measurements. - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with indeterminate results of the screen or cursor, like clear or vertical movement. Generally, these should be handled with a virtual terminal emulator (like 'pyte'). @@ -157,19 +156,11 @@ def width( if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}") - # 2b. cursor forward, backward, and OSC 66 text sizing width + # 2b. cursor forward, backward if (cforward_n := m.group('cforward_n')) is not None: current_col += int(cforward_n) if cforward_n else 1 elif (cbackward_n := m.group('cbackward_n')) is not None: current_col = max(0, current_col - (int(cbackward_n) if cbackward_n else 1)) - elif (ts_meta := m.group('ts_meta')) is not None: - ts_text = m.group('ts_text') - ts_term = m.group('ts_term') - assert ts_text is not None and ts_term is not None - text_size = TextSizing( - TextSizingParams.from_params(ts_meta, control_codes=control_codes), - ts_text, ts_term) - current_col += text_size.display_width(ambiguous_width) # 2c. 
SGR and other zero-width sequences -- no column advance idx = m.end() max_extent = max(max_extent, current_col) From 7cc284535e775e53e12009eee7fa0d4109ae975f Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 03:30:28 -0400 Subject: [PATCH 39/70] format --- wcwidth/escape_sequences.py | 2 +- wcwidth/wcwidth.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 806fdc1..8cb5cc2 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -162,4 +162,4 @@ def strip_sequences(text: str) -> str: >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') '[view]' """ - return ZERO_WIDTH_PATTERN.sub('', text) \ No newline at end of file + return ZERO_WIDTH_PATTERN.sub('', text) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 638d4a0..c055fb7 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -155,4 +155,4 @@ def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> if ambiguous_width == 2 and bisearch(ucs, _AMBIGUOUS_TABLE): return 2 - return 1 \ No newline at end of file + return 1 From c5fa80f07e7763a1f9fd85b9578341d9566b8477 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 03:48:41 -0400 Subject: [PATCH 40/70] rename files to prevent conflict --- tests/test_benchmarks.py | 2 +- tests/test_core.py | 4 ++-- wcwidth/__init__.py | 8 ++++---- wcwidth/{clip.py => _clip.py} | 2 +- wcwidth/{wcswidth.py => _wcswidth.py} | 2 +- wcwidth/{wcwidth.py => _wcwidth.py} | 0 wcwidth/{width.py => _width.py} | 4 ++-- wcwidth/align.py | 2 +- wcwidth/textwrap.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) rename wcwidth/{clip.py => _clip.py} (99%) rename wcwidth/{wcswidth.py => _wcswidth.py} (99%) rename wcwidth/{wcwidth.py => _wcwidth.py} (100%) rename wcwidth/{width.py => _width.py} (99%) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 813e934..f2ceee6 100644 
--- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -11,7 +11,7 @@ # local import wcwidth -_width_module = sys.modules['wcwidth.width'] +_width_module = sys.modules['wcwidth._width'] def test_wcwidth_ascii(benchmark): diff --git a/tests/test_core.py b/tests/test_core.py index f825050..3dc1479 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -9,9 +9,9 @@ # local import wcwidth -from wcwidth.width import _WIDTH_FAST_PATH_MIN_LEN +from wcwidth._width import _WIDTH_FAST_PATH_MIN_LEN -_wcwidth_module = sys.modules['wcwidth.wcwidth'] +_wcwidth_module = sys.modules['wcwidth._wcwidth'] # local diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 5020d2c..d38e383 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -9,14 +9,14 @@ # documented as public API # local -from .clip import clip +from ._clip import clip from .align import ljust, rjust, center -from .width import width -from .wcwidth import wcwidth, _wcmatch_version, _wcversion_value +from ._width import width +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap -from .wcswidth import wcswidth +from ._wcswidth import wcswidth from .sgr_state import propagate_sgr from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN diff --git a/wcwidth/clip.py b/wcwidth/_clip.py similarity index 99% rename from wcwidth/clip.py rename to wcwidth/_clip.py index 97c735e..f6febf8 100644 --- a/wcwidth/clip.py +++ b/wcwidth/_clip.py @@ -4,7 +4,7 @@ from typing import Optional, NamedTuple # local -from .width import width +from ._width import width from .grapheme import iter_graphemes from .sgr_state import (_SGR_STATE_DEFAULT, _sgr_state_update, diff --git a/wcwidth/wcswidth.py b/wcwidth/_wcswidth.py similarity index 99% rename from wcwidth/wcswidth.py rename to 
wcwidth/_wcswidth.py index 14b3250..423a6af 100644 --- a/wcwidth/wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -3,7 +3,7 @@ import typing # local -from .wcwidth import wcwidth +from ._wcwidth import wcwidth from .bisearch import bisearch from ._constants import (_EMOJI_ZWJ_SET, _ISC_VIRAMA_SET, diff --git a/wcwidth/wcwidth.py b/wcwidth/_wcwidth.py similarity index 100% rename from wcwidth/wcwidth.py rename to wcwidth/_wcwidth.py diff --git a/wcwidth/width.py b/wcwidth/_width.py similarity index 99% rename from wcwidth/width.py rename to wcwidth/_width.py index e6220b6..9c88083 100644 --- a/wcwidth/width.py +++ b/wcwidth/_width.py @@ -3,9 +3,9 @@ from typing import Literal # local -from .wcwidth import wcwidth +from ._wcwidth import wcwidth from .bisearch import bisearch -from .wcswidth import wcswidth +from ._wcswidth import wcswidth from ._constants import (_EMOJI_ZWJ_SET, _ISC_VIRAMA_SET, _CATEGORY_MC_TABLE, diff --git a/wcwidth/align.py b/wcwidth/align.py index ddb6886..abc38e7 100644 --- a/wcwidth/align.py +++ b/wcwidth/align.py @@ -2,7 +2,7 @@ from typing import Literal # local -from .width import width +from ._width import width def ljust( diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 9302b15..655910a 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -15,7 +15,7 @@ from typing import TYPE_CHECKING, Optional, NamedTuple # local -from .width import width as wcwidth_width +from ._width import width as wcwidth_width from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences From bf1676abfd82bd699e43daa623c7b08306811d2d Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 04:13:15 -0400 Subject: [PATCH 41/70] bugger fixer --- tests/test_textwrap.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index 1464f26..33da72a 100644 --- a/tests/test_textwrap.py +++ 
b/tests/test_textwrap.py @@ -211,9 +211,7 @@ def test_wrap_unicode(benchmark, text, w, expected): ['x\x1b[31mab\x1b[0m', '\x1b[31mcde\x1b[0m', '\x1b[31mfgh\x1b[0m', '\x1b[31mij\x1b[0m']), # Fs sequence (ESC d) - zero-width, stays with preceding text ('abc\x1bdefghij', 3, ['abc\x1bd', 'efg', 'hij']), -] - -@pytest.mark.parametrize('text,w,expected', SEQUENCE_CASES) +]) def test_wrap_sequences(benchmark, text, w, expected): """Escape sequence preservation (with propagate_sgr=True default)""" assert benchmark(wrap, text, w) == expected From e0da55401e2e16e6a74707943e1495425e265127 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 04:16:26 -0400 Subject: [PATCH 42/70] Add legacy import helper --- wcwidth/wcwidth.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 wcwidth/wcwidth.py diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py new file mode 100644 index 0000000..6afb3ac --- /dev/null +++ b/wcwidth/wcwidth.py @@ -0,0 +1,68 @@ +""" +Legacy compatibility module for wcwidth.wcwidth. + +This file contains no new definitions and is provided only for backwards +compatibility. 
This module exists solely to support legacy import paths:: + + from wcwidth.wcwidth import iter_graphemes + from wcwidth.wcwidth import _SGR_PATTERN +""" +# pylint: disable=unused-import + +# local +from ._clip import clip +from .align import ljust, rjust, center +from ._width import _CONTROL_CHAR_TABLE, _WIDTH_FAST_PATH_MIN_LEN, width, _width_ignored_codes +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value +from .bisearch import bisearch as _bisearch +from .grapheme import iter_graphemes +from .table_mc import CATEGORY_MC +from ._wcswidth import wcswidth +from .sgr_state import (_SGR_PATTERN, + _SGR_STATE_DEFAULT, + _sgr_state_update, + _sgr_state_is_active, + _sgr_state_to_sequence) +from ._constants import (_EMOJI_ZWJ_SET, + _ISC_VIRAMA_SET, + _LATEST_VERSION, + _AMBIGUOUS_TABLE, + _ZERO_WIDTH_TABLE, + _CATEGORY_MC_TABLE, + _FITZPATRICK_RANGE, + _WIDE_EASTASIAN_TABLE, + _REGIONAL_INDICATOR_SET) +from .table_vs16 import VS16_NARROW_TO_WIDE +from .table_wide import WIDE_EASTASIAN +from .table_zero import ZERO_WIDTH +from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL +from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR +from .table_ambiguous import AMBIGUOUS_EASTASIAN +from .escape_sequences import (ZERO_WIDTH_PATTERN, + CURSOR_LEFT_SEQUENCE, + CURSOR_RIGHT_SEQUENCE, + INDETERMINATE_EFFECT_SEQUENCE, + iter_sequences, + strip_sequences) +from .unicode_versions import list_versions + +_ISC_CONSONANT_TABLE = ISC_CONSONANT + +__all__ = ( + 'ZERO_WIDTH', + 'WIDE_EASTASIAN', + 'AMBIGUOUS_EASTASIAN', + 'VS16_NARROW_TO_WIDE', + 'list_versions', + 'wcwidth', + 'wcswidth', + 'width', + 'iter_sequences', + 'ljust', + 'rjust', + 'center', + 'clip', + 'strip_sequences', + '_wcmatch_version', + '_wcversion_value', +) From d214ef8aff356b52abb08775b54ee9aa6b51cd10 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 04:21:55 -0400 Subject: [PATCH 43/70] unused reference --- 
tests/test_core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 3dc1479..0ff8c10 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,9 +11,6 @@ import wcwidth from wcwidth._width import _WIDTH_FAST_PATH_MIN_LEN -_wcwidth_module = sys.modules['wcwidth._wcwidth'] -# local - def test_package_version(): """wcwidth.__version__ is expected value.""" From 0e232a702e5fbc04eac4ce9503c1b409989c448a Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 04:22:36 -0400 Subject: [PATCH 44/70] nit --- tests/test_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 0ff8c10..3208cbd 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,7 +1,5 @@ """Core tests for wcwidth module.""" - # std imports -import sys import importlib.metadata # 3rd party From 36d654b48825ebc963fc2b43618d9da1203ad399 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 05:08:32 -0400 Subject: [PATCH 45/70] make painter's algorithm optional by cursor sequence movement --- tox.ini | 1 + wcwidth/_clip.py | 388 ++++++++++++++++++++++-------------- wcwidth/_width.py | 9 +- wcwidth/escape_sequences.py | 10 + 4 files changed, 250 insertions(+), 158 deletions(-) diff --git a/tox.ini b/tox.ini index 8c5d19a..7c4d526 100644 --- a/tox.ini +++ b/tox.ini @@ -55,6 +55,7 @@ relative_files = True [coverage:report] omit = tests/* + wcwidth/wcwidth.py exclude_lines = pragma: no cover precision = 1 diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index f6febf8..eed31dd 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -10,7 +10,7 @@ _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) -from .escape_sequences import _SEQUENCE_CLASSIFY +from .escape_sequences import _SEQUENCE_CLASSIFY, _HORIZONTAL_CURSOR_MOVEMENT class VisToken(NamedTuple): @@ -102,6 +102,10 @@ def clip( if propagate_sgr and '\x1b' not in text: propagate_sgr = False + # Use painter's algorithm 
only when cursor movement (BS, CR, CSI C/D) can overwrite cells. + # Text without any horizontal movement uses a fast direct-append path. + use_painter = bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)) + # SGR tracking state (only when propagate_sgr=True) sgr_at_clip_start is # sgr state when first visible char emitted (None = not yet) sgr_at_clip_start = None @@ -110,171 +114,245 @@ def clip( if propagate_sgr: sgr = _SGR_STATE_DEFAULT - # Painter's algorithm data structures: - # map column integer to a visible character (with its width) - cells: dict[int, tuple[str, int]] = {} - # map column integer to a list of zero-width sequences emitted at that position - # (col, seq_order, text) - sequences: list[tuple[int, int, str]] = [] - # ordering of sequences - seq_order = 0 - - col = 0 - idx = 0 - - def _write_cells(s: str, w: int, write_col: int) -> None: - nonlocal sgr_at_clip_start - if w > 0: - # Fix up wide-char orphans and clear overwritten cells in one pass - for offset in range(w): - src_col = write_col + offset - if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) - if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) - cells.pop(src_col, None) - cells[write_col] = (s, w) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - def _append_seq(seq: str, at_col: Optional[int] = None) -> None: - nonlocal sgr_at_clip_start, seq_order - c = col if at_col is None else at_col - sequences.append((c, seq_order, seq)) - seq_order += 1 - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': - break - - # 1. 
Handle escape sequences and bare ESC — single regex dispatch - if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(text, idx) - if not m: - _append_seq(char) + if not use_painter: + # Simple path: no cursor movement — direct output.append() is sufficient. + # This matches the original (master-branch) clip performance characteristics. + output: list[str] = [] + col = 0 + idx = 0 + + while idx < len(text): + char = text[idx] + + # Early exit: past visible region, SGR captured, no escape ahead + if col >= end and sgr_at_clip_start is not None and char != '\x1b': + break + + # Handle escape sequences + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + output.append(char) + idx += 1 + continue + + # SGR handling: update state, don't emit sequence + if m.group('sgr_params') is not None and propagate_sgr and sgr: + sgr = _sgr_state_update(sgr, m.group()) + idx = m.end() + continue + + # Any other recognized sequence preserved as-is + output.append(m.group()) + idx = m.end() + continue + + # TAB expansion + if char == '\t': + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + output.append(' ') + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + col += 1 + else: + output.append(char) idx += 1 continue - # Dispatch on which named group captured: - if (m.group('sgr_params')) is not None and (propagate_sgr and sgr): - sgr = _sgr_state_update(sgr, m.group()) + # Grapheme clustering for everything else + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + + if grapheme_w == 0: + # combining/zero-width grapheme; preserve as token at this column + if start <= col < end: + output.append(grapheme) + elif col >= start and col + grapheme_w <= end: + # Fully visible + output.append(grapheme) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + elif col < end and col + grapheme_w > start: 
+ # Partially visible (wide char at boundary) — emit fillchars + output.append(fillchar * (min(end, col + grapheme_w) - max(start, col))) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + # advance column whether visible or not + col += grapheme_w + idx += len(grapheme) + + result = ''.join(output) + else: + # Painter's algorithm path: handles cursor movement (BS, CR, CSI C/D) + # that can overwrite previously emitted cells. + + # map column integer to a visible character (with its width) + cells: dict[int, tuple[str, int]] = {} + # map column integer to a list of zero-width sequences emitted at that position + # (col, seq_order, text) + sequences: list[tuple[int, int, str]] = [] + # ordering of sequences + seq_order = 0 + + col = 0 + idx = 0 + + def _write_cells(s: str, w: int, write_col: int) -> None: + nonlocal sgr_at_clip_start + if w > 0: + # Fix up wide-char orphans and clear overwritten cells in one pass + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + cells.pop(src_col, None) + cells[write_col] = (s, w) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + def _append_seq(seq: str, at_col: Optional[int] = None) -> None: + nonlocal sgr_at_clip_start, seq_order + c = col if at_col is None else at_col + sequences.append((c, seq_order, seq)) + seq_order += 1 + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + while idx < len(text): + char = text[idx] + + # Early exit: past visible region, SGR captured, no escape ahead + if col >= end and sgr_at_clip_start is not None and char != '\x1b': + break + + # 1. 
Handle escape sequences and bare ESC — single regex dispatch + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + _append_seq(char) + idx += 1 + continue + + # Dispatch on which named group captured: + if (m.group('sgr_params')) is not None and (propagate_sgr and sgr): + sgr = _sgr_state_update(sgr, m.group()) + idx = m.end() + continue + + # 1a. Cursor forward, + if (cforward_n := m.group('cforward_n')) is not None: + n_forward = int(cforward_n) if cforward_n else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) + col = move_end + idx = m.end() + continue + + # 1b. Cursor backward, + if (cbackward_n := m.group('cbackward_n')) is not None: + n_backward = int(cbackward_n) if cbackward_n else 1 + col = max(0, col - n_backward) + idx = m.end() + continue + + # 1c. Any other recognized zero-width sequence + _append_seq(m.group()) idx = m.end() continue - # 1a. Cursor forward, - if (cforward_n := m.group('cforward_n')) is not None: - n_forward = int(cforward_n) if cforward_n else 1 - move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) - col = move_end - idx = m.end() + # 2. TAB expansion + if char == '\t': + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + _write_cells(' ', 1, col) + col += 1 + else: + # preserve tab as-is + _append_seq(char) + idx += 1 continue - # 1b. Cursor backward, - if (cbackward_n := m.group('cbackward_n')) is not None: - n_backward = int(cbackward_n) if cbackward_n else 1 - col = max(0, col - n_backward) - idx = m.end() + # 3. 
Grapheme clustering for everything else + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + + if grapheme_w == 0: + # combining/zero-width grapheme; preserve as token at this column + if start <= col < end: + _append_seq(grapheme) + elif col >= start and col + grapheme_w <= end: + # Fully visible + _write_cells(grapheme, grapheme_w, col) + elif col < end and col + grapheme_w > start: + # Partially visible (wide char at boundary) — emit fillchars + clip_start = max(start, col) + for i in range(min(end, col + grapheme_w) - clip_start): + _write_cells(fillchar, 1, clip_start + i) + # advance column whether visible or not + col += grapheme_w + idx += len(grapheme) + + # Reconstruct result from "painter's algorithm", this allows us to + # accurately depict clipping with horizontal movement + seqs_by_col: dict[int, list[tuple[int, str]]] = {} + for col_pos, order, seq_text in sequences: + seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) + for entries in seqs_by_col.values(): + entries.sort() + + max_cell_col = max(cells.keys()) if cells else -1 + max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 + max_col = max(max_cell_col, max_seq_col) + + # Walk columns 0..min(max_col, end), emitting sequences then any cell + # or fillchar occupying each position. Visits *inclusive* of + # min(max_col, end) so sequences at `end` are preserved. + parts: list[str] = [] + walk_col = 0 + col_limit = min(max_col, end) + while walk_col <= col_limit: + # Zero-width sequences at this column + for _, seq_text in seqs_by_col.get(walk_col, ()): + parts.append(seq_text) + + if walk_col >= end: + walk_col += 1 continue - # 1c. Any other recognized zero-width sequence - _append_seq(m.group()) - idx = m.end() - continue - - # 2. 
TAB expansion - if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - _write_cells(' ', 1, col) - col += 1 + if walk_col in cells: + cell_text, cell_w = cells[walk_col] + cell_end = walk_col + cell_w + + if walk_col >= start and cell_end <= end: + # Fully inside clip window + parts.append(cell_text) + elif cell_end > start: + # Partial overlap (wide char split at boundary) + parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) + walk_col += cell_w else: - # preserve tab as-is - _append_seq(char) - idx += 1 - continue - - # 3. Grapheme clustering for everything else - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - - if grapheme_w == 0: - # combining/zero-width grapheme; preserve as token at this column - if start <= col < end: - _append_seq(grapheme) - elif col >= start and col + grapheme_w <= end: - # Fully visible - _write_cells(grapheme, grapheme_w, col) - elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) — emit fillchars - clip_start = max(start, col) - for i in range(min(end, col + grapheme_w) - clip_start): - _write_cells(fillchar, 1, clip_start + i) - # advance column whether visible or not - col += grapheme_w - idx += len(grapheme) - - # Reconstruct result from "painter's algorithm", this allows us to - # accurately depict clipping with horizontal movement - seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for col_pos, order, seq_text in sequences: - seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) - for entries in seqs_by_col.values(): - entries.sort() - - max_cell_col = max(cells.keys()) if cells else -1 - max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 - max_col = max(max_cell_col, max_seq_col) - - # Walk columns 0..min(max_col, end), emitting sequences then any cell - # or fillchar occupying each position. 
Visits *inclusive* of - # min(max_col, end) so sequences at `end` are preserved. - parts: list[str] = [] - walk_col = 0 - col_limit = min(max_col, end) - while walk_col <= col_limit: - # Zero-width sequences at this column - for _, seq_text in seqs_by_col.get(walk_col, ()): - parts.append(seq_text) - - if walk_col >= end: - walk_col += 1 - continue - - if walk_col in cells: - cell_text, cell_w = cells[walk_col] - cell_end = walk_col + cell_w - - if walk_col >= start and cell_end <= end: - # Fully inside clip window - parts.append(cell_text) - elif cell_end > start: - # Partial overlap (wide char split at boundary) - parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) - walk_col += cell_w - else: - # Hole: emit fillchar for columns inside (start, end) that lie - # within the written cell area - if start <= walk_col <= max_cell_col: - parts.append(fillchar) - walk_col += 1 - - # Trailing sequences past col_limit (SGR resets after short text, etc.) - for c in sorted(seqs_by_col.keys()): - if c > col_limit: - for _, seq_text in seqs_by_col[c]: - parts.append(seq_text) + # Hole: emit fillchar for columns inside (start, end) that lie + # within the written cell area + if start <= walk_col <= max_cell_col: + parts.append(fillchar) + walk_col += 1 + + # Trailing sequences past col_limit (SGR resets after short text, etc.) 
+ for c in sorted(seqs_by_col.keys()): + if c > col_limit: + for _, seq_text in seqs_by_col[c]: + parts.append(seq_text) - result = ''.join(parts) + result = ''.join(parts) # Apply SGR prefix/suffix if sgr_at_clip_start is not None: @@ -283,4 +361,4 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: if _sgr_state_is_active(sgr_at_clip_start): result += '\x1b[0m' - return result + return result \ No newline at end of file diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 9c88083..ca0d7be 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -14,7 +14,10 @@ from .table_vs16 import VS16_NARROW_TO_WIDE from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .table_grapheme import ISC_CONSONANT -from .escape_sequences import _SEQUENCE_CLASSIFY, INDETERMINATE_EFFECT_SEQUENCE, strip_sequences +from .escape_sequences import (_SEQUENCE_CLASSIFY, + CURSOR_MOVEMENT_SEQUENCE, + INDETERMINATE_EFFECT_SEQUENCE, + strip_sequences) # In 'parse' mode, strings longer than this are checked for cursor-movement # controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to @@ -117,8 +120,8 @@ def width( if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: # Check for cursor-affecting control characters if '\b' not in text and '\t' not in text and '\r' not in text: - # Check for escape sequences that can't be ignored, if present - if '\x1b' not in text or not _SEQUENCE_CLASSIFY.search(text): + # Check for escape sequences - if none contain cursor movement + if '\x1b' not in text or not CURSOR_MOVEMENT_SEQUENCE.search(text): control_codes = 'ignore' # Fast path for ignore mode, useful if you know the text is already free of control codes diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index b8b5012..1fd65b4 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -45,6 +45,16 @@ # Cursor left movement: CSI [n] D, parameter may be parsed by width() 
CURSOR_LEFT_SEQUENCE = re.compile(r'\x1b\[(\d*)D') +# Combined cursor movement: single regex for fast-path detection of any +# horizontal cursor movement (left or right). Avoids two separate search() +# calls in hot-path width() and clip() pre-checks. +CURSOR_MOVEMENT_SEQUENCE = re.compile(r'\x1b\[(\d*)[CD]') + +# Combined horizontal cursor movement: matches BS, CR, and CSI C/D cursor sequences +# in a single regex pass. Used by clip() to decide between the simple append path +# and the painter's algorithm. +_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\b\r]|\x1b\[(\d*)[CD]') + # Combined pattern: a single regex that matches any zero-width escape sequence # and classifies it via named groups, aprox 2x faster than redundant re.matches # in clip() and width(). From b4fbb7a9c19d82193197a71d7c7d7253ed8f0c98 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 05:22:20 -0400 Subject: [PATCH 46/70] use \x08 as suggested by co-pilot, i guess --- wcwidth/escape_sequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 1fd65b4..7d5c7cb 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -53,7 +53,7 @@ # Combined horizontal cursor movement: matches BS, CR, and CSI C/D cursor sequences # in a single regex pass. Used by clip() to decide between the simple append path # and the painter's algorithm. 
-_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\b\r]|\x1b\[(\d*)[CD]') +_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\x08\r]|\x1b\[(\d*)[CD]') # Combined pattern: a single regex that matches any zero-width escape sequence # and classifies it via named groups, aprox 2x faster than redundant re.matches From 9d8419ed3bf86859c8034712c0c5fdc1042510cf Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 13:23:24 -0400 Subject: [PATCH 47/70] basic support for \r and hpa --- docs/intro.rst | 5 +- tests/test_clip.py | 8 +-- tests/test_clip_cursors.py | 114 ++++++++++++++++++++++++------------ tests/test_width.py | 28 ++++++++- wcwidth/_clip.py | 64 +++++++++++++------- wcwidth/_width.py | 22 ++++++- wcwidth/escape_sequences.py | 13 ++-- 7 files changed, 182 insertions(+), 72 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 4c05214..2e7b3c4 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -469,7 +469,10 @@ History ======= 0.7.0 *2026-04-30* - * **Improved** `clip()` to support backward cursor sequence overwrite, "Painter's algorithm". + * **Improved** `clip()` and `width()` to support horizontal cursor sequences, especially + cursor_left (``cub``) can now overwrite previous rows, matching terminal behavior. + column_address (``hpa``) and carriage return (``\r``) are now parsed to move to beginning + of string, or, raise ValueError on ``strict``. 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, diff --git a/tests/test_clip.py b/tests/test_clip.py index 294421f..0a201cf 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -232,11 +232,11 @@ def test_clip_tab_with_sequences(): CLIP_CONTROL_CHAR_CASES = [ - ('abc\bde', 0, 5, 'abc\bde'), - ('ab\acd', 0, 4, 'ab\acd'), + ('abc\bde', 0, 5, 'abde'), + ('ab\acd', 0, 4, 'ab\x07cd'), ('ab\x00cd', 0, 4, 'ab\x00cd'), - ('abc\rde', 0, 5, 'abc\rde'), - ('\a\b\rHello', 0, 5, '\a\b\rHello'), + ('abc\rde', 0, 5, 'dec'), + ('\a\b\rHello', 0, 5, '\x07Hello'), ('ab\x01\x02cd', 0, 4, 'ab\x01\x02cd'), ('ab\x1b\x00cd', 0, 4, 'ab\x1b\x00cd'), ] diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 3da99b2..5b9f716 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -12,49 +12,89 @@ from wcwidth import clip -@pytest.mark.parametrize("text,start,end,expected", [ +@pytest.mark.parametrize("text,start,end,kwargs,expected", [ # Cursor-right introduces a gap that should be filled with spaces - ("hello\x1b[10Cworld", 0, 10, "hello" + " " * 5), + ("hello\x1b[10Cworld", 0, 10, {}, "hello" + " " * 5), # Clipping just the initial region ignores the later rightward write - ("hello\x1b[10Cworld", 0, 5, "hello"), + ("hello\x1b[10Cworld", 0, 5, {}, "hello"), # Cursor-left overwrites previous characters - ("hello\x1b[2DXY", 0, 5, "helXY"), + ("hello\x1b[2DXY", 0, 5, {}, "helXY"), # Cursor-left overwrites entire visible token - ("abc\x1b[3DXY", 0, 5, "XYc"), + ("abc\x1b[3DXY", 0, 5, {}, "XYc"), # Cursor-left at column 0 (prev_col not > col, no overwrite) - ("\x1b[2Dhi", 0, 2, "hi"), + ("\x1b[2Dhi", 0, 2, {}, "hi"), # Cursor-left with no visible tokens emitted - ("\x1b[5C\x1b[2Dhi", 5, 7, ""), + ("\x1b[5C\x1b[2Dhi", 5, 7, {}, ""), # Cursor-left overwrites text, seq tokens preserve column spatial order - ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, "cd\x1b]8;;http://x.com\x07"), + 
("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, {}, "cd\x1b]8;;http://x.com\x07"), # Cursor-left into wide char twice, second time on empty token triggers i < 0 break - ("中\x1b[D\x1b[Da", 0, 4, "a "), - ('ab\x1b[5Ccd', 0, 4, 'ab '), - ('abcde\x1b[2Df', 0, 6, 'abcfe'), - ('hello\x1b[5Dw', 0, 5, 'wello'), - ('ab\x1b[10Ccd', 0, 4, 'ab '), - ('XY\x1b[Czy', 0, 4, 'XY z'), - ('XY\x1b[Czy', 0, 5, 'XY zy'), - ('XY\x1b[Czy', 1, 3, 'Y '), - ('XY\x1b[Czy', 1, 4, 'Y z'), - ('LOL\x1b[5Clol', 0, 12, 'LOL lol'), - ('LOL\x1b[5Clol', 1, 11, 'OL lol'), - ('LOL\x1b[5Clol', 2, 11, 'L lol'), - ('LOL\x1b[5Clol', 3, 11, ' lol'), - ('LOL\x1b[5Clol', 4, 11, ' lol'), - ('LOL\x1b[5Clol', 5, 11, ' lol'), - ('LOL\x1b[5Clol', 6, 11, ' lol'), - ('LOL\x1b[5Clol', 7, 11, ' lol'), - ('LOL\x1b[5Clol', 8, 11, 'lol'), - ('LOL\x1b[5Clol', 9, 11, 'ol'), - + ("中\x1b[D\x1b[Da", 0, 4, {}, "a "), + ('ab\x1b[5Ccd', 0, 4, {}, 'ab '), + ('abcde\x1b[2Df', 0, 6, {}, 'abcfe'), + ('hello\x1b[5Dw', 0, 5, {}, 'wello'), + ('ab\x1b[10Ccd', 0, 4, {}, 'ab '), + ('XY\x1b[Czy', 0, 4, {}, 'XY z'), + ('XY\x1b[Czy', 0, 5, {}, 'XY zy'), + ('XY\x1b[Czy', 1, 3, {}, 'Y '), + ('XY\x1b[Czy', 1, 4, {}, 'Y z'), + ('LOL\x1b[5Clol', 0, 12, {}, 'LOL lol'), + ('LOL\x1b[5Clol', 1, 11, {}, 'OL lol'), + ('LOL\x1b[5Clol', 2, 11, {}, 'L lol'), + ('LOL\x1b[5Clol', 3, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 4, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 5, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 6, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 7, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 8, 11, {}, 'lol'), + ('LOL\x1b[5Clol', 9, 11, {}, 'ol'), + # SGR + cursor movement: SGR state update in painter path (line 245) + ('\x1b[31mab\x1b[2Dcd', 0, 4, {}, '\x1b[31mcd\x1b[0m'), + # Tab tabsize=0 in painter path (line 272->280 else branch) + ('ab\x1b[2D\tcd', 0, 4, {'tabsize': 0}, '\tcd'), + # Zero-width grapheme outside clip window in painter (line 290->301) + ('\x1b[2D\u0301hello', 1, 4, {}, 'ell'), + # Wide char partially clipped in painter (lines 298-299) + ('ab\x1b[2D中d', 1, 
4, {}, ' d'), + # walk_col >= end in painter reconstruction (327->328) + ('hello\x1b[2Dxy', 0, 3, {}, 'hel'), + # Hole fillchar in painter reconstruction (345->346) + ('\x1b[5Chi', 0, 7, {}, ' hi'), + # Trailing sequences stored at columns after col_limit (352, 354->355, 355->356) + ('abc\x1b[2D', 0, 2, {}, 'ab'), + # Bare ESC not part of any sequence, pass through in painter path (239->240) + ('a\x1bb\x1b[2Dc', 0, 3, {}, 'c\x1bb'), + # Tab with tabsize>0 in painter; `b` falls at col 4, inside (0,5) (277->284, 278->279, 278->280) + ('\x1b[2Da\tb', 0, 5, {'tabsize': 4}, 'a b'), + # propagate_sgr=False in painter path (225->226) + ('ab\x1b[2Dcd', 0, 4, {'propagate_sgr': False}, 'cd'), + # Non-SGR sequence before any visible text in painter (225->226 True) + ('\x1b]8;;http://x.com\x07ab\x1b[2Dcd', 0, 4, {}, '\x1b]8;;http://x.com\x07cd'), + # Bare ESC at end of text in painter (239->240) + ('ab\x1b[2D\x1b', 0, 2, {}, '\x1bab'), + # Wide char overwritten from right side (212 orphan fixup) + ('a中\x1b[Db', 0, 4, {}, 'a b'), + # Tab expansion with col+=1 not inside clip window (277->279, 293) + ('\x1b[2Ca\tb', 2, 4, {'tabsize': 8}, 'a '), + # CR: carriage return resets column to 0, overwriting earlier cells + ('aaa\r\r\rxxx', 0, 4, {}, 'xxx'), + ('abc\rXY', 0, 5, {}, 'XYc'), + ('hello\rworld', 0, 5, {}, 'world'), + # CR moves back to column 0 then writes within clip window + ('abc\rde', 1, 3, {}, 'ec'), + # BS: backspace overwrites previous character + ('abc\bde', 0, 5, {}, 'abde'), + ('abc\b\bXY', 0, 5, {}, 'aXY'), + ('ab\b\b\bXY', 0, 4, {}, 'XY'), + # HPA: horizontal position absolute (CSI n G) + ('abc\x1b[GXY', 0, 5, {}, 'XYc'), + ('abc\x1b[2GXY', 0, 5, {}, 'aXY'), + ('abc\x1b[5GXY', 0, 7, {}, 'abc XY'), + ('abc\x1b[5GXY', 0, 5, {}, 'abc X'), + ('\x1b[5GXY', 3, 7, {}, ' XY'), + # HPA no-param inside clip window + ('abc\x1b[GXY', 1, 4, {}, 'Yc'), ]) -def test_clip_cursor_sequences_expected_behaviour(text, start, end, expected): - """ - Verify clip() output matches 
terminal-visible columns after cursor moves. - - These tests capture the desired semantics: cursor-right creates blank cells (fillchar) in - the clipped output if the moved-to columns are within the clip window; cursor-left allows - subsequent characters to overwrite previous content and the clip should reflect that. - """ - assert repr(clip(text, start, end)) == repr(expected) +def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expected): + """Verify clip() output matches terminal-visible columns after cursor moves.""" + result = clip(text, start, end, **kwargs) + assert repr(result) == repr(expected) \ No newline at end of file diff --git a/tests/test_width.py b/tests/test_width.py index a54d476..30a5903 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -46,8 +46,10 @@ def test_width_control_codes_ignore(text, expected, name): ('hello\x7fworld', 'DEL'), ('hello\x80world', 'C1_control'), ('hello\nworld', 'LF'), + ('hello\rworld', 'CR'), ('hello\x1b[Hworld', 'cursor_home'), ('hello\x1b[Aworld', 'cursor_up'), + ('hello\x1b[5Gworld', 'hpa'), ] @@ -62,9 +64,9 @@ def test_width_control_codes_strict_raises(text, name): ('hello\x07world', 10, 'BEL'), ('hello\x00world', 10, 'NUL'), ('abc\bd', 3, 'backspace'), - ('abc\rxy', 3, 'CR'), ('\x1b[31mred\x1b[0m', 3, 'SGR_sequence'), ('a\x1b[2Cb', 4, 'cursor_right'), + ('a\x1b[3Db', 1, 'cursor_left'), ('\x1b', 0, 'lone_ESC'), ('a\x1bb', 1, 'fs_sequence_between'), ('\x1b!', 1, 'ESC_unrecognized'), @@ -108,6 +110,11 @@ def test_width_strict_indeterminate_raises(seq, cap_name): ('abcd\x1b[2De', 4, 'cursor_left'), ('\x1b[31mred\x1b[0m', 3, 'SGR'), ('ab\x1b[Hcd', 4, 'indeterminate'), + ('def\x1b[3Dabc', 3, 'cursor_left_overwrite'), + ('def\x1b[10Dabc', 3, 'cursor_left_past_start'), + ('abc\x1b[5Gde', 6, 'hpa_parse'), + ('abc\x1b[Gde', 3, 'hpa_no_param'), + ('\x1b[5Gabc', 7, 'hpa_before_text'), ] @@ -268,6 +275,25 @@ def test_carriage_return_resets_column(): assert wcwidth.width('abc\rde') == 3 +def 
test_carriage_return_strict_raises(): + """CR in strict mode raises ValueError (indeterminate starting column).""" + with pytest.raises(ValueError, match='Horizontal movement'): + wcwidth.width('hello\rworld', control_codes='strict') + + +def test_hpa_parse_best_effort(): + """HPA in parse mode assumes string begins at column 0.""" + assert wcwidth.width('abc\x1b[5Gde') == 6 + assert wcwidth.width('abc\x1b[Gde') == 3 + assert wcwidth.width('\x1b[10Ghi') == 11 + + +def test_hpa_strict_raises(): + """HPA in strict mode raises ValueError (indeterminate starting column).""" + with pytest.raises(ValueError, match='horizontal position'): + wcwidth.width('abc\x1b[5Gde', control_codes='strict') + + def test_iter_sequences_lone_esc(): """Lone ESC is yielded as a sequence.""" assert list(wcwidth.iter_sequences('\x1b')) == [('\x1b', True)] diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index eed31dd..03030fb 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -99,12 +99,15 @@ def clip( return text[start:end] # Fast path: no escape sequences means no SGR tracking needed - if propagate_sgr and '\x1b' not in text: + has_esc = '\x1b' in text + if propagate_sgr and not has_esc: propagate_sgr = False - # Use painter's algorithm only when cursor movement (BS, CR, CSI C/D) can overwrite cells. - # Text without any horizontal movement uses a fast direct-append path. - use_painter = bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)) + # Use painter's algorithm only when cursor movement (BS, CR, CSI C/D) can overwrite + # previously emitted cells. Text without any horizontal movement uses the fast simple path. + # Use direct char checks to avoid regex scan overhead for the common (no-cursor) case. 
+ use_painter = ('\x08' in text or '\r' in text or + (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) # SGR tracking state (only when propagate_sgr=True) sgr_at_clip_start is # sgr state when first visible char emitted (None = not yet) @@ -186,7 +189,7 @@ def clip( result = ''.join(output) else: - # Painter's algorithm path: handles cursor movement (BS, CR, CSI C/D) + # Painter's algorithm path: handles cursor movement (BS, CR, CSI C/D/G) # that can overwrite previously emitted cells. # map column integer to a visible character (with its width) @@ -202,16 +205,15 @@ def clip( def _write_cells(s: str, w: int, write_col: int) -> None: nonlocal sgr_at_clip_start - if w > 0: - # Fix up wide-char orphans and clear overwritten cells in one pass - for offset in range(w): - src_col = write_col + offset - if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) - if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) - cells.pop(src_col, None) - cells[write_col] = (s, w) + # Fix up wide-char orphans and clear overwritten cells in one pass + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + cells.pop(src_col, None) + cells[write_col] = (s, w) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr @@ -244,7 +246,13 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx = m.end() continue - # 1a. Cursor forward, + # 1a. HPA: horizontal position absolute (CSI n G) + if (hpa_n := m.group('hpa_n')) is not None: + col = int(hpa_n) - 1 if hpa_n else 0 + idx = m.end() + continue + + # 1b. 
Cursor forward, if (cforward_n := m.group('cforward_n')) is not None: n_forward = int(cforward_n) if cforward_n else 1 move_end = col + n_forward @@ -255,19 +263,33 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx = m.end() continue - # 1b. Cursor backward, + # 1c. Cursor backward, if (cbackward_n := m.group('cbackward_n')) is not None: n_backward = int(cbackward_n) if cbackward_n else 1 col = max(0, col - n_backward) idx = m.end() continue - # 1c. Any other recognized zero-width sequence + # 1d. Any other recognized zero-width sequence _append_seq(m.group()) idx = m.end() continue - # 2. TAB expansion + # 2. Carriage return and backspace (before TAB/grapheme fallthrough) + if char == '\r': + # CR: reset column to 0 + col = 0 + idx += 1 + continue + + if char == '\x08': + # BS: decrement column + if col > 0: + col -= 1 + idx += 1 + continue + + # 3. TAB expansion if char == '\t': if tabsize > 0: next_tab = col + (tabsize - (col % tabsize)) @@ -281,7 +303,7 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx += 1 continue - # 3. Grapheme clustering for everything else + # 4. 
Grapheme clustering for everything else grapheme = next(iter_graphemes(text, start=idx)) grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) @@ -361,4 +383,4 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: if _sgr_state_is_active(sgr_at_clip_start): result += '\x1b[0m' - return result \ No newline at end of file + return result diff --git a/wcwidth/_width.py b/wcwidth/_width.py index ca0d7be..a15ff73 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -16,6 +16,7 @@ from .table_grapheme import ISC_CONSONANT from .escape_sequences import (_SEQUENCE_CLASSIFY, CURSOR_MOVEMENT_SEQUENCE, + CURSOR_HPA_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, strip_sequences) @@ -159,12 +160,22 @@ def width( if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}") - # 2b. cursor forward, backward - if (cforward_n := m.group('cforward_n')) is not None: + # 2b. horizontal position absolute (before forward/backward to + # avoid other_seq match in _SEQUENCE_CLASSIFY) + if (hpa_n := m.group('hpa_n')) is not None: + target_col = int(hpa_n) if hpa_n else 1 + if strict: + raise ValueError( + f"Indeterminate horizontal position at position {idx}, " + f"{seq!r} (absolute column unknown)" + ) + current_col = target_col - 1 # HPA is 1-indexed, convert to 0-indexed + # 2c. cursor forward, backward + elif (cforward_n := m.group('cforward_n')) is not None: current_col += int(cforward_n) if cforward_n else 1 elif (cbackward_n := m.group('cbackward_n')) is not None: current_col = max(0, current_col - (int(cbackward_n) if cbackward_n else 1)) - # 2c. SGR and other zero-width sequences -- no column advance + # 2d. 
SGR and other zero-width sequences -- no column advance idx = m.end() max_extent = max(max_extent, current_col) continue @@ -190,6 +201,11 @@ def width( if current_col > 0: current_col -= 1 elif char == '\x0d': # Carriage return + if strict: + raise ValueError( + f"Horizontal movement character \\r at position {idx}: " + "indeterminate starting column" + ) current_col = 0 max_extent = max(max_extent, current_col) idx += 1 diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 7d5c7cb..afa8c43 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -45,21 +45,25 @@ # Cursor left movement: CSI [n] D, parameter may be parsed by width() CURSOR_LEFT_SEQUENCE = re.compile(r'\x1b\[(\d*)D') +# Horizontal position absolute: CSI [n] G, parameter may be parsed by width() +CURSOR_HPA_SEQUENCE = re.compile(r'\x1b\[(\d*)G') + # Combined cursor movement: single regex for fast-path detection of any -# horizontal cursor movement (left or right). Avoids two separate search() +# horizontal cursor movement (left, right, hpa). Avoids two separate search() # calls in hot-path width() and clip() pre-checks. -CURSOR_MOVEMENT_SEQUENCE = re.compile(r'\x1b\[(\d*)[CD]') +CURSOR_MOVEMENT_SEQUENCE = re.compile(r'\x1b\[(\d*)[CDG]') -# Combined horizontal cursor movement: matches BS, CR, and CSI C/D cursor sequences +# Combined horizontal cursor movement: matches BS, CR, and CSI C/D/G cursor sequences # in a single regex pass. Used by clip() to decide between the simple append path # and the painter's algorithm. -_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\x08\r]|\x1b\[(\d*)[CD]') +_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\x08\r]|\x1b\[(\d*)[CDG]') # Combined pattern: a single regex that matches any zero-width escape sequence # and classifies it via named groups, aprox 2x faster than redundant re.matches # in clip() and width(). 
_SEQUENCE_CLASSIFY = re.compile( _SGR_PATTERN.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_HPA_SEQUENCE.pattern.replace('(', '(?P', 1) + '|' + CURSOR_RIGHT_SEQUENCE.pattern.replace('(', '(?P', 1) + '|' + CURSOR_LEFT_SEQUENCE.pattern.replace('(', '(?P', 1) + '|' + r'(?P(?:' + ZERO_WIDTH_PATTERN.pattern + '))' @@ -77,7 +81,6 @@ r'\x1b\[\d+;\d+r', # change_scroll_region r'\x1b\[\d*K', # erase_in_line (clr_eol, clr_bol) r'\x1b\[\d*J', # erase_in_display (clr_eos, erase_display) - r'\x1b\[\d*G', # column_address r'\x1b\[\d+;\d+H', # cursor_address r'\x1b\[\d*H', # cursor_home r'\x1b\[\d*A', # cursor_up From c38dddfcc7e81d800e770e15166b652fd3f1adc7 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 13:58:32 -0400 Subject: [PATCH 48/70] checkpoint, 100% coverage but hyperlinks todo --- docs/intro.rst | 43 +++++++++++++++++++++++++------------- tests/test_clip_cursors.py | 4 ++++ wcwidth/_clip.py | 10 ++------- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 2e7b3c4..e76d3f4 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -1,3 +1,4 @@ + |pypi_downloads| |codecov| |license| ============ @@ -35,28 +36,39 @@ Some examples of **incorrect results**: Solution -------- -The lowest-level functions in this library are the POSIX.1-2001 and POSIX.1-2008 `wcwidth(3)`_ and -`wcswidth(3)`_, which this library precisely copies by interface as `wcwidth()`_ and `wcswidth()`_. -These functions return -1 when C0 and C1 control codes are present. +The lowest-level functions in this library are derived from POSIX.1-2001 and POSIX.1-2008 +`wcwidth(3)`_ and `wcswidth(3)`_, which this library precisely copies by interface as `wcwidth()`_ +and `wcswidth()`_. These functions return -1 when C0 and C1 control codes are present. 
An easy-to-use `width()`_ function is provided as a wrapper of `wcswidth()`_ that is also capable of measuring most terminal control codes and sequences, like colors, bold, tabstops, and horizontal cursor movement. -Text-justification is solved by the grapheme and sequence-aware functions `ljust()`_, -`rjust()`_, `center()`_, and `wrap()`_, serving as drop-in replacements to python standard functions -of the same names. +Text-justification is solved by the sequence-aware functions `ljust()`_, +`rjust()`_, `center()`_, and the grapheme-aware function `wrap()`_, serving as drop-in replacements +to python standard functions. + +The `clip()`_ function extracts substrings by their displayed column positions, and +`strip_sequences()`_ removes terminal escape sequences from text altogether. The iterator functions `iter_graphemes()`_ and `iter_sequences()`_ allow for careful navigation of -grapheme and terminal control sequence boundaries. `iter_graphemes_reverse()`_, and -`grapheme_boundary_before()`_ are useful for editing and searching of complex unicode. The -`clip()`_ function extracts substrings by display column positions, and `strip_sequences()`_ removes -terminal escape sequences from text altogether. +grapheme and terminal control sequence boundaries as required by editors or REPLs with cursor +control. `iter_graphemes_reverse()`_, and `grapheme_boundary_before()`_ are often necessary for +backward cursor control over complex unicode. Discrepancies ------------- -You may find that support *varies* for complex unicode sequences or codepoints. +You may find that support *varies* for complex unicode sequences or codepoints. This library may be +considered to presume the terminal is enabled for DEC Private Mode 2027 ("Grapheme Clustering"), but +that specification does not fully describe support of varying unicode versions, feature levels, or +make any interpretation of standards for all languages or complex scripts.
See `Grapheme Clusters +and Terminal Emulators`_ and `terminal-unicode-core.tex`_ for more details about testing and +enabling mode 2027 support. + +This library takes a progressive approach of interpretation of standards (specification_), and +where interpretation is unclear, to match popular modern terminals. This library does *not* support +any alternate "legacy width" measurement. A companion utility, `jquast/ucs-detect`_ was authored to gather and publish the results of Wide character, language/grapheme clustering and complex script support, emojis and zero-width joiner, @@ -263,9 +275,9 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se >>> clip('\x1b[1;31mHello world\x1b[0m', 6, 11) '\x1b[1;31mworld\x1b[0m' - >>> # Disable SGR propagation to preserve original sequences as-is - >>> clip('\x1b[31m中文\x1b[0m', 0, 3, propagate_sgr=False) - '\x1b[31m中 \x1b[0m' + >>> # Disable SGR propagation to preserve sequence order outside of clip boundary + >>> clip('\x1b[31m中文\x1b[32m', 0, 3, propagate_sgr=False) + '\x1b[31m中 \x1b[32m' strip_sequences() ----------------- @@ -758,6 +770,9 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`Terminal.detect_ambiguous_width()`: https://blessed.readthedocs.io/en/latest/api/terminal.html#blessed.terminal.Terminal.detect_ambiguous_width .. _`parity padding`: https://jazcap53.github.io/pythons-eccentric-strcenter.html .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +.. _`Grapheme Clusters and Terminal Emulators`: https://mitchellh.com/writing/grapheme-clusters-in-terminals +.. _`terminal-unicode-core.tex`: https://github.com/contour-terminal/terminal-unicode-core/blob/master/spec/terminal-unicode-core.tex + .. 
|pypi_downloads| image:: https://img.shields.io/pypi/dm/wcwidth.svg?logo=pypi :alt: Downloads :target: https://pypi.org/project/wcwidth/ diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 5b9f716..221b28e 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -93,6 +93,10 @@ ('\x1b[5GXY', 3, 7, {}, ' XY'), # HPA no-param inside clip window ('abc\x1b[GXY', 1, 4, {}, 'Yc'), + # walk_col >= end with sequences at column == end (line 351) + ('\x1b[5C\x1b]8;;http://x.com\x07', 0, 5, {'propagate_sgr': False}, ' \x1b]8;;http://x.com\x07'), + # Trailing sequences past col_limit (line 374) + ('\x1b[5C\x1b]8;;http://x.com\x07', 0, 3, {'propagate_sgr': False}, ' \x1b]8;;http://x.com\x07'), ]) def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expected): """Verify clip() output matches terminal-visible columns after cursor moves.""" diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 03030fb..01b0beb 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -352,14 +352,8 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: if walk_col in cells: cell_text, cell_w = cells[walk_col] - cell_end = walk_col + cell_w - - if walk_col >= start and cell_end <= end: - # Fully inside clip window - parts.append(cell_text) - elif cell_end > start: - # Partial overlap (wide char split at boundary) - parts.append(fillchar * (min(cell_end, end) - max(walk_col, start))) + # All cells satisfy walk_col >= start and walk_col + cell_w <= end + parts.append(cell_text) walk_col += cell_w else: # Hole: emit fillchar for columns inside (start, end) that lie From 1c60bd38974ba9ad8f00b32617a0272cb0198fb1 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 16:10:36 -0400 Subject: [PATCH 49/70] clip() is getting a bit long in the tooth! 
--- docs/intro.rst | 48 ++++++-- tests/test_clip.py | 114 ++++++++++++++++++ tests/test_clip_cursors.py | 2 +- wcwidth/_clip.py | 239 ++++++++++++++++++++++++++++++++++--- wcwidth/_width.py | 1 - 5 files changed, 373 insertions(+), 31 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index e76d3f4..acfb242 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -130,33 +130,55 @@ Use function `width()`_ to measure a string with improved handling of ``control_ >>> # same support as wcswidth(), eg. regional indicator flag: >>> wcwidth.width('\U0001F1FF\U0001F1FC') 2 - >>> # but also supports SGR colored text, 'WARN', followed by SGR reset + >>> # but also supports sequences, like SGR colored text, "WARN", followed by reset >>> wcwidth.width('\x1b[38;2;255;150;100mWARN\x1b[0m') 4 - >>> # tabs, + >>> # tabs are measured as though the string begins at a tabstop, >>> wcwidth.width('\t', tabsize=4) 4 - >>> # or, tab and all other control characters can be ignored - >>> wcwidth.width('\t', control_codes='ignore') - 0 - >>> # "vertical" control characters are ignored - >>> wcwidth.width('\n') + >>> # Kitty text sizing protocol (OSC 66) are measured, eg. 
2x-scaled "Hello" + >>> wcwidth.width('\x1b]66;s=2;Hello\x07') + 10 + >>> # or, all control characters can be ignored (including tab) + >>> wcwidth.width('\t\n\a\r', control_codes='ignore') 0 - >>> # as well as sequences with "indeterminate" effects like Home + Clear + >>> # sequences with "indeterminate" effects like Home + Clear are zero-width >>> wcwidth.width('\x1b[H\x1b[2J') 0 - >>> # Kitty text sizing protocol (OSC 66): 2x-scaled "Hello" occupies 10 cells - >>> wcwidth.width('\x1b]66;s=2;Hello\x07') + >>> # horizontal cursor movements are parsed, + >>> wcwidth.width('hello\b\b\b\b\bworld') + 5 + >>> wcwidth.width('hello\x1b[5Dworld') + 5 + >>> # or ignored, + >>> wcwidth.width('hello\x1b[5Dworld', control_codes='ignore') 10 + +Use ``control_codes='ignore'`` when the input is known not to contain any control characters or +terminal sequences for slightly improved performance. Note that TAB (``'\t'``) is a control +character and is also ignored, you may want to use `str.expandtabs()`_, first. + +Use ``control_codes='strict'`` when input is known to contain some control sequences, such as +SGR color, bold, hyperlinks and cursor movement. Any sequence that cannot be accurately parsed, +such as clearing the screen, vertical, or absolute cursor movement will raise ``ValueError``: + +.. code-block:: python + >>> # or, raise ValueError for "indeterminate" effects using control_codes='strict' >>> wcwidth.width('\n', control_codes='strict') Traceback (most recent call last): ... ValueError: Vertical movement character 0xa at position 0 -Use ``control_codes='ignore'`` when the input is known not to contain any control characters or -terminal sequences for slightly improved performance. Note that TAB (``'\t'``) is a control -character and is also ignored, you may want to use `str.expandtabs()`_, first. + + >>> wcwidth.width('\x1b[H\x1b[2J', control_codes='strict') + Traceback (most recent call last): + ... 
+ ValueError: Indeterminate cursor sequence at position 0, '\x1b[H' + +TODO: should raise ValueError (out of bounds), +>>> wcwidth.width('a\x1b[5Da', control_codes='strict') + iter_sequences() ---------------- diff --git a/tests/test_clip.py b/tests/test_clip.py index 0a201cf..04fa520 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -138,6 +138,120 @@ def test_clip_sequences_osc_hyperlink(): ) +# ── OSC 8 hyperlink clipping ────────────────────────────────────────────────── + +OSC_START_BEL = '\x1b]8;;http://example.com\x07' +OSC_END_BEL = '\x1b]8;;\x07' +OSC_START_ST = '\x1b]8;;http://example.com\x1b\\' +OSC_END_ST = '\x1b]8;;\x1b\\' + + +CLIP_HYPERLINK_CASES = [ + # Full hyperlink visible — preserved as-is + (f'{OSC_START_BEL}link{OSC_END_BEL}', 0, 4, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # Clipping middle of hyperlink text — rebuild around clipped inner text + (f'{OSC_START_BEL}Click This link{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}This{OSC_END_BEL}'), + # Clipping from start — only first portion + (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 0, 5, + f'{OSC_START_BEL}Click{OSC_END_BEL}'), + # Clipping from end — only last portion + (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}This{OSC_END_BEL}'), + # Hyperlink entirely before clip window — dropped + (f'{OSC_START_BEL}link{OSC_END_BEL}world', 0, 4, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # Hyperlink entirely after clip window — dropped + (f'hello{OSC_START_BEL}link{OSC_END_BEL}', 0, 5, 'hello'), + # Hyperlink clipped to nothing — empty hyperlink dropped + (f'{OSC_START_BEL}link{OSC_END_BEL}', 5, 10, ''), + # Empty hyperlink (no inner text) — dropped + (f'before{OSC_START_BEL}{OSC_END_BEL}after', 0, 11, 'beforeafter'), + # Hyperlink with CJK text clipped + (f'{OSC_START_BEL}中文文字{OSC_END_BEL}', 0, 4, + f'{OSC_START_BEL}中文{OSC_END_BEL}'), + # Hyperlink with CJK text clipped at odd column + (f'{OSC_START_BEL}中文文字{OSC_END_BEL}', 0, 3, + f'{OSC_START_BEL}中 {OSC_END_BEL}'), + # 
Hyperlink with ST terminator + (f'{OSC_START_ST}Click This{OSC_END_ST}', 0, 5, + f'{OSC_START_ST}Click{OSC_END_ST}'), + # Multiple non-overlapping hyperlinks + (f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}', 0, 5, + f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}'), + # Hyperlink with params preserved + ('\x1b]8;id=myid;http://x.com\x07link\x1b]8;;\x07', 1, 3, + '\x1b]8;id=myid;http://x.com\x07in\x1b]8;;\x07'), + # Hyperlink text before clip window, hyperlink within + (f'before{OSC_START_BEL}link{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # SGR inside hyperlink is preserved + (f'{OSC_START_BEL}\x1b[31mred link\x1b[0m{OSC_END_BEL}', 4, 8, + f'{OSC_START_BEL}\x1b[31mlink\x1b[0m{OSC_END_BEL}'), + # Hyperlink open without matching close — preserved as regular sequence + ('\x1b]8;;http://x.com\x07link', 0, 4, '\x1b]8;;http://x.com\x07link'), + # Nested hyperlinks + ('\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07', 0, 14, + '\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07'), + # Bare ESC between hyperlink markers + ('\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07', 0, 6, + '\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07'), +] + + +@pytest.mark.parametrize('text,start,end,expected', CLIP_HYPERLINK_CASES) +def test_clip_osc_hyperlink_text_clipping(text, start, end, expected): + """OSC 8 hyperlink inner text is clipped and hyperlink rebuilt.""" + assert repr(clip(text, start, end)) == repr(expected) + + +# control_codes variants with cursor movement into hyperlink +_HLINK_OVERWRITE = f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[2Dxy' +CLIP_HYPERLINK_CONTROL_CODES_CASES = [ + ('parse', 0, 4, f'{OSC_START_BEL}link{OSC_END_BEL}'), + ('ignore', 0, 6, f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[2Dxy'), +] + + +@pytest.mark.parametrize('control_codes,start,end,expected', + CLIP_HYPERLINK_CONTROL_CODES_CASES) +def test_clip_hyperlink_control_codes_overwrite(control_codes, start, end, expected): + assert repr(clip(_HLINK_OVERWRITE, 
start, end, control_codes=control_codes)) == repr(expected) + + +def test_clip_osc_hyperlink_strict_raises(): + """control_codes='strict' raises ValueError when overwriting hyperlink cells.""" + with pytest.raises(ValueError, match='OSC 8 hyperlink'): + clip(_HLINK_OVERWRITE, 0, 4, control_codes='strict') + + +# Painter-path hyperlink edge cases +CLIP_HYPERLINK_PAINTER_CASES = [ + # Empty hyperlink dropped + (f'\x1b[2D{OSC_START_BEL}{OSC_END_BEL}xy', 'parse', 0, 4, 'xy'), + # Hyperlink entirely after clip window — skipped + (f'\x1b[2Dab{OSC_START_BEL}cde{OSC_END_BEL}', 'parse', 0, 2, 'ab'), + # Hyperlink entirely before clip window — skipped + (f'{OSC_START_BEL}ab{OSC_END_BEL}\x1b[2Dcdef', 'parse', 2, 4, 'ef'), + # Hyperlink overlapping clip window — clipped + (f'\x1b[2D{OSC_START_BEL}abcdef{OSC_END_BEL}', 'parse', 0, 3, + f'{OSC_START_BEL}abc{OSC_END_BEL}'), + # Bare ESC inside hyperlink in painter path + (f'\x1b[2D{OSC_START_BEL}a\x1bb{OSC_END_BEL}', 'parse', 0, 4, + f'{OSC_START_BEL}a\x1bb{OSC_END_BEL}'), + # strict mode: non-hyperlink cells don't overlap hyperlink_cells + (f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[5Chi', 'strict', 0, 11, + f'{OSC_START_BEL}link{OSC_END_BEL} hi'), +] + + +@pytest.mark.parametrize('text,control_codes,start,end,expected', + CLIP_HYPERLINK_PAINTER_CASES) +def test_clip_hyperlink_painter_cases(text, control_codes, start, end, expected): + assert repr(clip(text, start, end, control_codes=control_codes)) == repr(expected) + + def test_clip_sequences_cjk_with_sequences(): assert clip('\x1b[31m中文\x1b[0m', 0, 3) == '\x1b[31m中 \x1b[0m' diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 221b28e..eb1c635 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -101,4 +101,4 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expected): """Verify clip() output matches terminal-visible columns after cursor moves.""" result = clip(text, start, end, **kwargs) - assert 
repr(result) == repr(expected) \ No newline at end of file + assert repr(result) == repr(expected) diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 01b0beb..d8635d1 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -1,7 +1,8 @@ """This is a python implementation of clip().""" # std imports +import re -from typing import Optional, NamedTuple +from typing import Literal, Optional, NamedTuple # local from ._width import width @@ -12,19 +13,58 @@ _sgr_state_to_sequence) from .escape_sequences import _SEQUENCE_CLASSIFY, _HORIZONTAL_CURSOR_MOVEMENT +# OSC 8 hyperlink parsing (mirrors textwrap.py to avoid circular import) +_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') +_HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(?:\x07|\x1b\\)') -class VisToken(NamedTuple): - """A visible text segment with its display width and starting column.""" - text: str - width: int - start_col: int +class _HyperlinkState(NamedTuple): + """Open OSC 8 hyperlink: url, params, terminator (BEL or ST).""" + url: str + params: str + terminator: str -class SeqToken(NamedTuple): - """A zero-width terminal sequence (escape sequences, control chars, etc.).""" - text: str +def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: + if (m := _HYPERLINK_OPEN_RE.match(seq)): + return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) + return None + + +def _make_hyperlink_open(state: _HyperlinkState) -> str: + return f'\x1b]8;{state.params};{state.url}{state.terminator}' + + +def _make_hyperlink_close(terminator: str) -> str: + return f'\x1b]8;;{terminator}' + + +def _find_hyperlink_close(text: str, open_end: int) -> Optional[tuple[int, int]]: + """ + Find matching OSC 8 close, handling nesting. + + Returns (start, end) or None. 
+ """ + depth = 1 + idx = open_end + while idx < len(text): + if text[idx] != '\x1b': + idx += 1 + continue + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + idx += 1 + continue + seq = m.group() + if _HYPERLINK_CLOSE_RE.match(seq): + depth -= 1 + if depth == 0: + return (idx, m.end()) + elif _parse_hyperlink_open(seq): + depth += 1 + idx = m.end() + return None def clip( @@ -36,6 +76,7 @@ def clip( tabsize: int = 8, ambiguous_width: int = 1, propagate_sgr: bool = True, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', ) -> str: r""" Clip text to display columns ``(start, end)`` while preserving all terminal sequences. @@ -51,6 +92,14 @@ def clip( Other cursor movement characters (backspace, carriage return) and cursor movement sequences are passed through unchanged as zero-width. + **OSC 8 hyperlinks** are handled specially: the visible text inside a hyperlink + is clipped to the requested column range, and the hyperlink is rebuilt around + the clipped text. Empty hyperlinks (those with no remaining visible text after + clipping) are removed:: + + >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 6, 10) + '\x1b]8;;http://example.com\x07This\x1b]8;;\x07' + :param text: String to clip, may contain terminal escape sequences. :param start: Absolute starting column (inclusive, 0-indexed). :param end: Absolute ending column (exclusive). @@ -63,10 +112,24 @@ def clip( :param propagate_sgr: If True (default), SGR (terminal styling) sequences are propagated. The result begins with any active style at the start position and ends with a reset sequence if styles are active. + :param control_codes: How to handle control characters and sequences: + + - ``'parse'`` (default): Track horizontal cursor movement and clip + hyperlink text. Cursor overwrite of hyperlink cells is allowed + (the hyperlink open/close are preserved as sequences). 
+ - ``'strict'``: Like ``parse``, but raises :exc:`ValueError` when a + cursor movement would overwrite a cell that is part of an OSC 8 + hyperlink, as this produces indeterminate results on real terminals. + - ``'ignore'``: All control characters are treated as zero-width. + Cursor movement is not tracked (fastest path). + :returns: Substring of ``text`` spanning display columns ``(start, end)``, with all terminal sequences preserved and wide characters at boundaries replaced with ``fillchar``. + :raises ValueError: If ``control_codes='strict'`` and a cursor movement + would overwrite a cell that was emitted as part of an OSC 8 hyperlink. + SGR (terminal styling) sequences are propagated by default. The result begins with any active style and ends with a reset:: @@ -80,6 +143,9 @@ def clip( .. versionchanged:: 0.5.0 Added ``propagate_sgr`` parameter (default True). + .. versionchanged:: 0.7.0 + Added ``control_codes`` parameter and OSC 8 hyperlink-aware clipping. + Example:: >>> clip('hello world', 0, 5) @@ -94,6 +160,8 @@ def clip( if end <= start: return '' + strict = control_codes == 'strict' + # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) if text.isascii() and text.isprintable(): return text[start:end] @@ -106,8 +174,11 @@ def clip( # Use painter's algorithm only when cursor movement (BS, CR, CSI C/D) can overwrite # previously emitted cells. Text without any horizontal movement uses the fast simple path. # Use direct char checks to avoid regex scan overhead for the common (no-cursor) case. 
- use_painter = ('\x08' in text or '\r' in text or - (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) + use_painter = ( + control_codes != 'ignore' and + ('\x08' in text or '\r' in text or + (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) + ) # SGR tracking state (only when propagate_sgr=True) sgr_at_clip_start is # sgr state when first visible char emitted (None = not yet) @@ -139,14 +210,69 @@ def clip( idx += 1 continue + seq = m.group() + # SGR handling: update state, don't emit sequence if m.group('sgr_params') is not None and propagate_sgr and sgr: - sgr = _sgr_state_update(sgr, m.group()) + sgr = _sgr_state_update(sgr, seq) idx = m.end() continue + # OSC 8 hyperlink open: process as a unit (recursively clip inner text) + if (hl_state := _parse_hyperlink_open(seq)): + close_span = _find_hyperlink_close(text, m.end()) + if close_span is None: + # No matching close: treat as regular zero-width sequence + output.append(seq) + idx = m.end() + continue + + close_start, close_end = close_span + inner_text = text[m.end():close_start] + inner_width = width( + inner_text, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + + if inner_width == 0: + # Empty hyperlink: drop entirely + idx = close_end + continue + + # Determine if hyperlink column range overlaps clip window + hl_col_start = col + hl_col_end = col + inner_width + + if hl_col_end <= start or hl_col_start >= end: + # Hyperlink entirely outside clip window: skip it + col += inner_width + idx = close_end + continue + + # Hyperlink overlaps clip window: recursively clip inner text + inner_clip_start = max(0, start - col) + inner_clip_end = end - col + + clipped_inner = clip( + inner_text, inner_clip_start, inner_clip_end, + fillchar=fillchar, tabsize=tabsize, + ambiguous_width=ambiguous_width, + propagate_sgr=False, + control_codes=control_codes, + ) + + output.append(_make_hyperlink_open(hl_state)) + output.append(clipped_inner) + 
output.append(_make_hyperlink_close(hl_state.terminator)) + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + col += inner_width + idx = close_end + continue + # Any other recognized sequence preserved as-is - output.append(m.group()) + output.append(seq) idx = m.end() continue @@ -194,6 +320,8 @@ def clip( # map column integer to a visible character (with its width) cells: dict[int, tuple[str, int]] = {} + # set of column positions belonging to hyperlink visible cells (for strict mode) + hyperlink_cells: set[int] = set() # map column integer to a list of zero-width sequences emitted at that position # (col, seq_order, text) sequences: list[tuple[int, int, str]] = [] @@ -203,17 +331,33 @@ def clip( col = 0 idx = 0 - def _write_cells(s: str, w: int, write_col: int) -> None: + def _write_cells(s: str, w: int, write_col: int, + is_hyperlink: bool = False) -> None: nonlocal sgr_at_clip_start + # Strict-mode check: overwriting hyperlink cells is indeterminate + if strict and not is_hyperlink: + for offset in range(w): + if write_col + offset in hyperlink_cells: + raise ValueError( + f"Cursor movement at column {write_col + offset} " + f"would overwrite an OSC 8 hyperlink cell. " + f"Use control_codes='parse' to allow this." 
+ ) # Fix up wide-char orphans and clear overwritten cells in one pass for offset in range(w): src_col = write_col + offset if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: cells[src_col - 1] = (fillchar, 1) + hyperlink_cells.discard(src_col - 1) if cells.get(src_col, ('', 0))[1] == 2: cells[src_col + 1] = (fillchar, 1) + hyperlink_cells.discard(src_col + 1) cells.pop(src_col, None) + hyperlink_cells.discard(src_col) cells[write_col] = (s, w) + if is_hyperlink: + for offset in range(w): + hyperlink_cells.add(write_col + offset) if propagate_sgr and sgr_at_clip_start is None: sgr_at_clip_start = sgr @@ -240,12 +384,75 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx += 1 continue + seq = m.group() + # Dispatch on which named group captured: if (m.group('sgr_params')) is not None and (propagate_sgr and sgr): - sgr = _sgr_state_update(sgr, m.group()) + sgr = _sgr_state_update(sgr, seq) idx = m.end() continue + # OSC 8 hyperlink open: process as a unit (recursively clip inner text) + if (hl_state := _parse_hyperlink_open(seq)): + close_span = _find_hyperlink_close(text, m.end()) + if close_span is None: + # No matching close: treat as regular sequence + _append_seq(seq) + idx = m.end() + continue + + close_start, close_end = close_span + inner_text = text[m.end():close_start] + inner_width = width( + inner_text, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + + if inner_width == 0: + # Empty hyperlink: drop entirely + idx = close_end + continue + + # Determine if hyperlink column range overlaps clip window + hl_col_start = col + hl_col_end = col + inner_width + + if hl_col_end <= start or hl_col_start >= end: + # Hyperlink entirely outside clip window: skip it + col += inner_width + idx = close_end + continue + + # Hyperlink overlaps clip window: recursively clip inner text + inner_clip_start = max(0, start - col) + inner_clip_end = end - col + + clipped_inner = clip( + inner_text, 
inner_clip_start, inner_clip_end, + fillchar=fillchar, tabsize=tabsize, + ambiguous_width=ambiguous_width, + propagate_sgr=False, + control_codes=control_codes, + ) + + # Emit hyperlink open as sequence, then clipped cells + _append_seq(_make_hyperlink_open(hl_state)) + inner_clipped_width = width( + clipped_inner, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + _write_cells(clipped_inner, inner_clipped_width, col, + is_hyperlink=True) + col += inner_clipped_width + # Emit hyperlink close as sequence after the cells + _append_seq(_make_hyperlink_close(hl_state.terminator), + at_col=col) + + # Advance past the original hyperlink content + col = hl_col_end + idx = close_end + continue + # 1a. HPA: horizontal position absolute (CSI n G) if (hpa_n := m.group('hpa_n')) is not None: col = int(hpa_n) - 1 if hpa_n else 0 @@ -271,7 +478,7 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: continue # 1d. Any other recognized zero-width sequence - _append_seq(m.group()) + _append_seq(seq) idx = m.end() continue diff --git a/wcwidth/_width.py b/wcwidth/_width.py index a15ff73..4ad6531 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -16,7 +16,6 @@ from .table_grapheme import ISC_CONSONANT from .escape_sequences import (_SEQUENCE_CLASSIFY, CURSOR_MOVEMENT_SEQUENCE, - CURSOR_HPA_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, strip_sequences) From 4ede8196fa8044600a92100124bb5f175dd82064 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 16:53:13 -0400 Subject: [PATCH 50/70] last rewrite? 
--- docs/intro.rst | 58 ++-- tests/test_benchmarks.py | 2 +- tests/test_clip.py | 26 +- tests/test_clip_cursors.py | 18 ++ tests/test_width.py | 19 +- wcwidth/_clip.py | 638 ++++++++++++++++++------------------- wcwidth/_wcswidth.py | 2 +- wcwidth/_width.py | 16 +- wcwidth/textwrap.py | 2 +- 9 files changed, 418 insertions(+), 363 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index acfb242..3c91c10 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -44,9 +44,9 @@ An easy-to-use `width()`_ function is provided as a wrapper of `wcswidth()`_ tha measuring most terminal control codes and sequences, like colors, bold, tabstops, and horizontal cursor movement. -Text-justification is solved by the thesequence-aware functions `ljust()`_, -`rjust()`_, `center()`_, and and grapheme-aware function `wrap()`_, serving as drop-in replacements -to python standard functions. +Text-justification is solved by the sequence-aware functions `ljust()`_, `rjust()`_, `center()`_, +and the grapheme-aware function `wrap()`_, serving as drop-in replacements to python standard +functions. The `clip()`_ function extracts substrings by their displayed column positions, and `strip_sequences()`_ removes terminal escape sequences from text altogether. @@ -61,19 +61,16 @@ Discrepancies You may find that support *varies* for complex unicode sequences or codepoints. This library may be considered to presume the terminal is enabled for DEC Private Mode 2027 ("Grapheme Clustering"), but -that specification does not fully describe support of varying unicode versions, feature levels, or -make any interpretation of standards for all languages or complex scripts. See `Grapheme Clusters -and Terminal Emulators`_ and `terminal-unicode-core.tex`_ for more details about testing and -enabling mode 2027 support. +the specification does not fully describe varying unicode versions, feature levels, or details of +specific language support. 
This library does *not* support any alternate "legacy width" measurement. -This library takes a progressive approach of interpretation of standards (specification_), and -where interpretation is unclear, to match popular modern terminals. This library does *not* support -any alternate "legacy width" measurement. +See `Grapheme Clusters and Terminal Emulators`_ and `terminal-unicode-core.tex`_, and `State of +Terminal Emulators in 2025`_ for more details on Mode 2027 and unicode-aware terminals. -A companion utility, `jquast/ucs-detect`_ was authored to gather and publish the results of Wide -character, language/grapheme clustering and complex script support, emojis and zero-width joiner, -variations, and regional indicator (flags) as a `General Tabulated Summary`_ by terminal emulator -software and version. +The `jquast/ucs-detect`_ utility is used to gather and publish the results of compliance to our +standard for Wide character, Languages, grapheme clustering, complex or combining scripts, emojis, +zero-width joiner, variations, and regional indicator (flags) as a `General +Tabulated Summary`_ by terminal emulator software and version. ======== Overview @@ -176,8 +173,12 @@ such as clearing the screen, vertical, or absolute cursor movement will raise `` ... ValueError: Indeterminate cursor sequence at position 0, '\x1b[H' -TODO: should raise ValueError (out of bounds), ->>> wcwidth.width('a\x1b[5Da', control_codes='strict') + + >>> # cursor left movement beyond string start raises in strict mode, + >>> wcwidth.width('a\x1b[5Da', control_codes='strict') + Traceback (most recent call last): + ... 
+ ValueError: Cursor left movement at position 1 would move 5 cells left from column 1, exceeding string start iter_sequences() @@ -301,6 +302,16 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se >>> clip('\x1b[31m中文\x1b[32m', 0, 3, propagate_sgr=False) '\x1b[31m中 \x1b[32m' + + >>> # Cursor-left overwrites previous text (painter's algorithm) + >>> clip('hello\x1b[2DXY', 0, 5) + 'helXY' + >>> # Carriage return resets to column 0, overwriting earlier cells + >>> clip('abc\rXY', 0, 5) + 'XYc' + >>> # OSC 8 hyperlink text is clipped and the hyperlink rebuilt + >>> clip('\x1b]8;;http://x.com\x07Click This link\x1b]8;;\x07', 6, 10) + '\x1b]8;;http://x.com\x07This\x1b]8;;\x07' strip_sequences() ----------------- @@ -503,10 +514,15 @@ History ======= 0.7.0 *2026-04-30* - * **Improved** `clip()` and `width()` to support horizontal cursor sequences, especially - cursor_left (``cub``) can now overwrite previous rows, matching terminal behavior. - column_address (``hpa``) and carriage return (``\r``) are now parsed to move to beginning - of string, or, raise ValueError on ``strict``. + * **New** ``control_codes`` parameter for `clip()`_, supporting ``'parse'`` (default), + ``'strict'``, and ``'ignore'`` modes for control character and cursor movement handling. + * **Improved** `clip()`_ with OSC 8 hyperlink-aware clipping: visible text inside hyperlinks + is clipped to the requested column range, and the hyperlink is rebuilt around the clipped text. + * **Improved** `clip()`_ and `width()`_ to support horizontal cursor sequences (``cub``, ``cuf``, + ``hpa``). Cursor-left (``cub``) can now overwrite previous text, matching terminal behavior. + ``column_address`` (``hpa``) and carriage return (``\r``) are now parsed, or raise + ``ValueError`` on ``strict``. Cursor-left movement beyond string start raises ``ValueError`` + in strict mode. 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, @@ -794,7 +810,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ .. _`Grapheme Clusters and Terminal Emulators`: https://mitchellh.com/writing/grapheme-clusters-in-terminals .. _`terminal-unicode-core.tex`: https://github.com/contour-terminal/terminal-unicode-core/blob/master/spec/terminal-unicode-core.tex - +.. _`State of Terminal Emulators in 2025`: https://www.jeffquast.com/post/state-of-terminal-emulation-2025/ .. |pypi_downloads| image:: https://img.shields.io/pypi/dm/wcwidth.svg?logo=pypi :alt: Downloads :target: https://pypi.org/project/wcwidth/ diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index f2ceee6..7b8b2b9 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -328,7 +328,7 @@ def test_iter_sequences_mixed(benchmark): benchmark(lambda: list(wcwidth.iter_sequences(text))) -# Brahmic script benchmarks — text with virama conjuncts +# Brahmic script benchmarks -- text with virama conjuncts BRAHMIC_DEVANAGARI = 'हिन्दी भाषा में लिखा गया पाठ है। क्षत्रिय स्त्री ' * 20 BRAHMIC_BENGALI = 'বাংলা ভাষায় লেখা একটি পাঠ। বাঙ্গালী ভাষা ' * 20 diff --git a/tests/test_clip.py b/tests/test_clip.py index 04fa520..9e51851 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -138,7 +138,7 @@ def test_clip_sequences_osc_hyperlink(): ) -# ── OSC 8 hyperlink clipping ────────────────────────────────────────────────── +# OSC 8 hyperlink clipping OSC_START_BEL = '\x1b]8;;http://example.com\x07' OSC_END_BEL = '\x1b]8;;\x07' @@ -147,26 +147,26 @@ def test_clip_sequences_osc_hyperlink(): CLIP_HYPERLINK_CASES = [ - # Full hyperlink visible — preserved as-is + # Full hyperlink visible -- preserved as-is (f'{OSC_START_BEL}link{OSC_END_BEL}', 0, 4, f'{OSC_START_BEL}link{OSC_END_BEL}'), - # Clipping middle of hyperlink text — rebuild around 
clipped inner text + # Clipping middle of hyperlink text -- rebuild around clipped inner text (f'{OSC_START_BEL}Click This link{OSC_END_BEL}', 6, 10, f'{OSC_START_BEL}This{OSC_END_BEL}'), - # Clipping from start — only first portion + # Clipping from start -- only first portion (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 0, 5, f'{OSC_START_BEL}Click{OSC_END_BEL}'), - # Clipping from end — only last portion + # Clipping from end -- only last portion (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 6, 10, f'{OSC_START_BEL}This{OSC_END_BEL}'), - # Hyperlink entirely before clip window — dropped + # Hyperlink entirely before clip window -- dropped (f'{OSC_START_BEL}link{OSC_END_BEL}world', 0, 4, f'{OSC_START_BEL}link{OSC_END_BEL}'), - # Hyperlink entirely after clip window — dropped + # Hyperlink entirely after clip window -- dropped (f'hello{OSC_START_BEL}link{OSC_END_BEL}', 0, 5, 'hello'), - # Hyperlink clipped to nothing — empty hyperlink dropped + # Hyperlink clipped to nothing -- empty hyperlink dropped (f'{OSC_START_BEL}link{OSC_END_BEL}', 5, 10, ''), - # Empty hyperlink (no inner text) — dropped + # Empty hyperlink (no inner text) -- dropped (f'before{OSC_START_BEL}{OSC_END_BEL}after', 0, 11, 'beforeafter'), # Hyperlink with CJK text clipped (f'{OSC_START_BEL}中文文字{OSC_END_BEL}', 0, 4, @@ -189,7 +189,7 @@ def test_clip_sequences_osc_hyperlink(): # SGR inside hyperlink is preserved (f'{OSC_START_BEL}\x1b[31mred link\x1b[0m{OSC_END_BEL}', 4, 8, f'{OSC_START_BEL}\x1b[31mlink\x1b[0m{OSC_END_BEL}'), - # Hyperlink open without matching close — preserved as regular sequence + # Hyperlink open without matching close -- preserved as regular sequence ('\x1b]8;;http://x.com\x07link', 0, 4, '\x1b]8;;http://x.com\x07link'), # Nested hyperlinks ('\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07', 0, 14, @@ -230,11 +230,11 @@ def test_clip_osc_hyperlink_strict_raises(): CLIP_HYPERLINK_PAINTER_CASES = [ # Empty hyperlink dropped (f'\x1b[2D{OSC_START_BEL}{OSC_END_BEL}xy', 
'parse', 0, 4, 'xy'), - # Hyperlink entirely after clip window — skipped + # Hyperlink entirely after clip window -- skipped (f'\x1b[2Dab{OSC_START_BEL}cde{OSC_END_BEL}', 'parse', 0, 2, 'ab'), - # Hyperlink entirely before clip window — skipped + # Hyperlink entirely before clip window -- skipped (f'{OSC_START_BEL}ab{OSC_END_BEL}\x1b[2Dcdef', 'parse', 2, 4, 'ef'), - # Hyperlink overlapping clip window — clipped + # Hyperlink overlapping clip window -- clipped (f'\x1b[2D{OSC_START_BEL}abcdef{OSC_END_BEL}', 'parse', 0, 3, f'{OSC_START_BEL}abc{OSC_END_BEL}'), # Bare ESC inside hyperlink in painter path diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index eb1c635..03ad940 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -102,3 +102,21 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expe """Verify clip() output matches terminal-visible columns after cursor moves.""" result = clip(text, start, end, **kwargs) assert repr(result) == repr(expected) + + +def test_clip_cursor_left_strict_out_of_bounds(): + """clip() with control_codes='strict' raises on cursor-left beyond string start.""" + with pytest.raises(ValueError, match='Cursor left movement'): + clip('a\x1b[5Da', 0, 1, control_codes='strict') + + +def test_clip_cursor_left_strict_out_of_bounds_painter(): + """clip() strict-mode raises on cursor-left beyond start in painter path.""" + with pytest.raises(ValueError, match='Cursor left movement'): + clip('\x1b[2Dab', 0, 2, control_codes='strict') + + +def test_clip_cursor_left_out_of_bounds_parse_no_raise(): + """clip() parse mode silently clamps cursor-left beyond start.""" + assert clip('a\x1b[5Da', 0, 1) == 'a' + assert clip('ab\x1b[99Dcd', 0, 4) == 'cd' diff --git a/tests/test_width.py b/tests/test_width.py index 30a5903..d94bc7a 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -66,7 +66,7 @@ def test_width_control_codes_strict_raises(text, name): ('abc\bd', 3, 'backspace'), 
('\x1b[31mred\x1b[0m', 3, 'SGR_sequence'), ('a\x1b[2Cb', 4, 'cursor_right'), - ('a\x1b[3Db', 1, 'cursor_left'), + ('ab\x1b[Db', 2, 'cursor_left'), ('\x1b', 0, 'lone_ESC'), ('a\x1bb', 1, 'fs_sequence_between'), ('\x1b!', 1, 'ESC_unrecognized'), @@ -294,6 +294,23 @@ def test_hpa_strict_raises(): wcwidth.width('abc\x1b[5Gde', control_codes='strict') +def test_cursor_left_strict_out_of_bounds(): + """Cursor-left beyond string start raises ValueError in strict mode.""" + with pytest.raises(ValueError, match='Cursor left movement'): + wcwidth.width('a\x1b[5Da', control_codes='strict') + + +def test_cursor_left_out_of_bounds_parse_no_raise(): + """Cursor-left beyond string start is silently clamped in parse mode.""" + assert wcwidth.width('a\x1b[5Da') == 1 + assert wcwidth.width('abc\x1b[99Ddef') == 3 # 99D clamped to col 0, then b,c,d overwritten + + +def test_cursor_left_out_of_bounds_ignore_mode(): + """Cursor-left beyond string start is zero-width in ignore mode.""" + assert wcwidth.width('a\x1b[5Da', control_codes='ignore') == 2 + + def test_iter_sequences_lone_esc(): """Lone ESC is yielded as a sequence.""" assert list(wcwidth.iter_sequences('\x1b')) == [('\x1b', True)] diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index d8635d1..00fa177 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -1,4 +1,6 @@ """This is a python implementation of clip().""" +from __future__ import annotations + # std imports import re @@ -89,8 +91,11 @@ def clip( TAB characters (``\t``) are expanded to spaces up to the next tab stop, controlled by the ``tabsize`` parameter. - Other cursor movement characters (backspace, carriage return) and cursor - movement sequences are passed through unchanged as zero-width. + When no horizontal cursor movements are present (backspace, carriage return, or + CSI C/D/G sequences), cursor movement characters and sequences are passed through + as zero-width sequences. 
When cursor movement is detected, a "painter's + algorithm" is used instead: cursor movements actively change the write position, + allowing cursor-left and carriage return to overwrite previously written cells. **OSC 8 hyperlinks** are handled specially: the visible text inside a hyperlink is clipped to the requested column range, and the hyperlink is rebuilt around @@ -155,7 +160,6 @@ def clip( >>> clip('a\tb', 0, 10) # Tab expanded to spaces 'a b' """ - # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks,W0101 start = max(start, 0) if end <= start: return '' @@ -188,9 +192,134 @@ def clip( if propagate_sgr: sgr = _SGR_STATE_DEFAULT + # Inner helpers + # Closure-based to avoid LOAD_GLOBAL overhead on hot-path calls. + # Each has low individual McCabe complexity. + + def _mark_sgr_capture() -> None: + """Record SGR state at first visible emit, if not already captured.""" + nonlocal sgr_at_clip_start + if propagate_sgr and sgr_at_clip_start is None: + sgr_at_clip_start = sgr + + def _process_hyperlink( + hl_state: _HyperlinkState, match_end: int, col: int, + ) -> tuple[str, object]: + """Process OSC 8 hyperlink unit. 
+ + Returns (action, data): + action='no_close' -> data unused (emit as regular seq, advance past match_end) + action='empty' -> data is close_end (skip entirely) + action='outside' -> data is (inner_width, close_end) (advance col, skip) + action='visible' -> data is (open_seq, clipped_inner, close_seq, + inner_width, hl_col_end, close_end) + """ + close_span = _find_hyperlink_close(text, match_end) + if close_span is None: + return ('no_close', None) + + close_start, close_end = close_span + inner_text = text[match_end:close_start] + inner_width = width( + inner_text, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + + if inner_width == 0: + return ('empty', close_end) + + hl_col_start = col + hl_col_end = col + inner_width + + if hl_col_end <= start or hl_col_start >= end: + return ('outside', (inner_width, close_end)) + + inner_clip_start = max(0, start - col) + inner_clip_end = end - col + + clipped_inner = clip( + inner_text, inner_clip_start, inner_clip_end, + fillchar=fillchar, tabsize=tabsize, + ambiguous_width=ambiguous_width, + propagate_sgr=False, + control_codes=control_codes, + ) + + return ('visible', ( + _make_hyperlink_open(hl_state), + clipped_inner, + _make_hyperlink_close(hl_state.terminator), + inner_width, + hl_col_end, + close_end, + )) + + def _emit_tab_simple(col: int, output: list[str]) -> int: + """Expand tab for simple-path, appending spaces to output list.""" + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + output.append(' ') + _mark_sgr_capture() + col += 1 + else: + output.append('\t') + return col + + def _emit_tab_painter(col: int, write_cells, append_seq) -> int: + """Expand tab for painter-path.""" + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + write_cells(' ', 1, col) + col += 1 + else: + append_seq('\t') + return col + + def _handle_grapheme_simple( + 
grapheme: str, gw: int, col: int, output: list[str], + ) -> None: + """Emit grapheme to simple-path output list based on visibility.""" + if gw == 0: + if start <= col < end: + output.append(grapheme) + elif col >= start and col + gw <= end: + output.append(grapheme) + _mark_sgr_capture() + elif col < end and col + gw > start: + output.append(fillchar * (min(end, col + gw) - max(start, col))) + _mark_sgr_capture() + + def _handle_grapheme_painter( + grapheme: str, gw: int, col: int, write_cells, append_seq, + ) -> None: + """Emit grapheme to painter-path based on visibility.""" + if gw == 0: + if start <= col < end: + append_seq(grapheme) + elif col >= start and col + gw <= end: + write_cells(grapheme, gw, col) + elif col < end and col + gw > start: + clip_start = max(start, col) + for offset in range(min(end, col + gw) - clip_start): + write_cells(fillchar, 1, clip_start + offset) + + def _apply_sgr_wrap(result: str) -> str: + """Apply SGR prefix/suffix around result.""" + if sgr_at_clip_start is not None: + if prefix := _sgr_state_to_sequence(sgr_at_clip_start): + result = prefix + result + if _sgr_state_is_active(sgr_at_clip_start): + result += '\x1b[0m' + return result + + # Main loops + if not use_painter: - # Simple path: no cursor movement — direct output.append() is sufficient. - # This matches the original (master-branch) clip performance characteristics. 
+ # Simple path: no cursor movement output: list[str] = [] col = 0 idx = 0 @@ -210,233 +339,134 @@ def clip( idx += 1 continue - seq = m.group() - # SGR handling: update state, don't emit sequence if m.group('sgr_params') is not None and propagate_sgr and sgr: - sgr = _sgr_state_update(sgr, seq) + sgr = _sgr_state_update(sgr, m.group()) idx = m.end() continue - # OSC 8 hyperlink open: process as a unit (recursively clip inner text) - if (hl_state := _parse_hyperlink_open(seq)): - close_span = _find_hyperlink_close(text, m.end()) - if close_span is None: - # No matching close: treat as regular zero-width sequence - output.append(seq) + # OSC 8 hyperlink + if hl_state := _parse_hyperlink_open(m.group()): + action, data = _process_hyperlink(hl_state, m.end(), col) + if action == 'no_close': + output.append(m.group()) idx = m.end() - continue - - close_start, close_end = close_span - inner_text = text[m.end():close_start] - inner_width = width( - inner_text, control_codes=control_codes, - tabsize=tabsize, ambiguous_width=ambiguous_width, - ) - - if inner_width == 0: - # Empty hyperlink: drop entirely + elif action == 'empty': + idx = data + elif action == 'outside': + inner_width, close_end = data + col += inner_width idx = close_end - continue - - # Determine if hyperlink column range overlaps clip window - hl_col_start = col - hl_col_end = col + inner_width - - if hl_col_end <= start or hl_col_start >= end: - # Hyperlink entirely outside clip window: skip it + else: # 'visible' + open_seq, clipped_inner, close_seq, inner_width, _, close_end = data + output.append(open_seq) + output.append(clipped_inner) + output.append(close_seq) + _mark_sgr_capture() col += inner_width idx = close_end - continue - - # Hyperlink overlaps clip window: recursively clip inner text - inner_clip_start = max(0, start - col) - inner_clip_end = end - col - - clipped_inner = clip( - inner_text, inner_clip_start, inner_clip_end, - fillchar=fillchar, tabsize=tabsize, - 
ambiguous_width=ambiguous_width, - propagate_sgr=False, - control_codes=control_codes, - ) - - output.append(_make_hyperlink_open(hl_state)) - output.append(clipped_inner) - output.append(_make_hyperlink_close(hl_state.terminator)) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - col += inner_width - idx = close_end continue # Any other recognized sequence preserved as-is - output.append(seq) + output.append(m.group()) idx = m.end() continue # TAB expansion if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - output.append(' ') - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += 1 - else: - output.append(char) + col = _emit_tab_simple(col, output) idx += 1 continue - # Grapheme clustering for everything else + # Grapheme clustering grapheme = next(iter_graphemes(text, start=idx)) grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - - if grapheme_w == 0: - # combining/zero-width grapheme; preserve as token at this column - if start <= col < end: - output.append(grapheme) - elif col >= start and col + grapheme_w <= end: - # Fully visible - output.append(grapheme) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) — emit fillchars - output.append(fillchar * (min(end, col + grapheme_w) - max(start, col))) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - # advance column whether visible or not + _handle_grapheme_simple(grapheme, grapheme_w, col, output) col += grapheme_w idx += len(grapheme) - result = ''.join(output) - else: - # Painter's algorithm path: handles cursor movement (BS, CR, CSI C/D/G) - # that can overwrite previously emitted cells. 
- - # map column integer to a visible character (with its width) - cells: dict[int, tuple[str, int]] = {} - # set of column positions belonging to hyperlink visible cells (for strict mode) - hyperlink_cells: set[int] = set() - # map column integer to a list of zero-width sequences emitted at that position - # (col, seq_order, text) - sequences: list[tuple[int, int, str]] = [] - # ordering of sequences - seq_order = 0 - - col = 0 - idx = 0 - - def _write_cells(s: str, w: int, write_col: int, - is_hyperlink: bool = False) -> None: - nonlocal sgr_at_clip_start - # Strict-mode check: overwriting hyperlink cells is indeterminate - if strict and not is_hyperlink: - for offset in range(w): - if write_col + offset in hyperlink_cells: - raise ValueError( - f"Cursor movement at column {write_col + offset} " - f"would overwrite an OSC 8 hyperlink cell. " - f"Use control_codes='parse' to allow this." - ) - # Fix up wide-char orphans and clear overwritten cells in one pass - for offset in range(w): - src_col = write_col + offset - if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) - hyperlink_cells.discard(src_col - 1) - if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) - hyperlink_cells.discard(src_col + 1) - cells.pop(src_col, None) - hyperlink_cells.discard(src_col) - cells[write_col] = (s, w) - if is_hyperlink: - for offset in range(w): - hyperlink_cells.add(write_col + offset) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - def _append_seq(seq: str, at_col: Optional[int] = None) -> None: - nonlocal sgr_at_clip_start, seq_order - c = col if at_col is None else at_col - sequences.append((c, seq_order, seq)) - seq_order += 1 - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != 
'\x1b': - break + result = _apply_sgr_wrap(''.join(output)) + return result - # 1. Handle escape sequences and bare ESC — single regex dispatch - if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(text, idx) - if not m: - _append_seq(char) - idx += 1 - continue + # Painter's algorithm path: handles cursor movement + cells: dict[int, tuple[str, int]] = {} + hyperlink_cells: set[int] = set() + sequences: list[tuple[int, int, str]] = [] + seq_order = 0 - seq = m.group() + col = 0 + idx = 0 - # Dispatch on which named group captured: - if (m.group('sgr_params')) is not None and (propagate_sgr and sgr): - sgr = _sgr_state_update(sgr, seq) - idx = m.end() - continue + def _write_cells(s: str, w: int, write_col: int, + is_hyperlink: bool = False) -> None: + nonlocal sgr_at_clip_start + if strict and not is_hyperlink: + for offset in range(w): + if write_col + offset in hyperlink_cells: + raise ValueError( + f"Cursor movement at column {write_col + offset} " + f"would overwrite an OSC 8 hyperlink cell. " + f"Use control_codes='parse' to allow this." 
+ ) + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + hyperlink_cells.discard(src_col - 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + hyperlink_cells.discard(src_col + 1) + cells.pop(src_col, None) + hyperlink_cells.discard(src_col) + cells[write_col] = (s, w) + if is_hyperlink: + for offset in range(w): + hyperlink_cells.add(write_col + offset) + _mark_sgr_capture() - # OSC 8 hyperlink open: process as a unit (recursively clip inner text) - if (hl_state := _parse_hyperlink_open(seq)): - close_span = _find_hyperlink_close(text, m.end()) - if close_span is None: - # No matching close: treat as regular sequence - _append_seq(seq) - idx = m.end() - continue + def _append_seq(seq: str, at_col: Optional[int] = None) -> None: + nonlocal seq_order + c = col if at_col is None else at_col + sequences.append((c, seq_order, seq)) + seq_order += 1 + _mark_sgr_capture() - close_start, close_end = close_span - inner_text = text[m.end():close_start] - inner_width = width( - inner_text, control_codes=control_codes, - tabsize=tabsize, ambiguous_width=ambiguous_width, - ) + while idx < len(text): + char = text[idx] - if inner_width == 0: - # Empty hyperlink: drop entirely - idx = close_end - continue + # Early exit: past visible region, SGR captured, no escape ahead + if col >= end and sgr_at_clip_start is not None and char != '\x1b': + break - # Determine if hyperlink column range overlaps clip window - hl_col_start = col - hl_col_end = col + inner_width + # 1. 
Handle escape sequences -- single regex dispatch + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + _append_seq(char) + idx += 1 + continue - if hl_col_end <= start or hl_col_start >= end: - # Hyperlink entirely outside clip window: skip it - col += inner_width - idx = close_end - continue - - # Hyperlink overlaps clip window: recursively clip inner text - inner_clip_start = max(0, start - col) - inner_clip_end = end - col - - clipped_inner = clip( - inner_text, inner_clip_start, inner_clip_end, - fillchar=fillchar, tabsize=tabsize, - ambiguous_width=ambiguous_width, - propagate_sgr=False, - control_codes=control_codes, - ) + # SGR handling: update state, don't emit sequence + if m.group('sgr_params') is not None and propagate_sgr and sgr: + sgr = _sgr_state_update(sgr, m.group()) + idx = m.end() + continue - # Emit hyperlink open as sequence, then clipped cells - _append_seq(_make_hyperlink_open(hl_state)) + # OSC 8 hyperlink + if hl_state := _parse_hyperlink_open(m.group()): + action, data = _process_hyperlink(hl_state, m.end(), col) + if action == 'no_close': + _append_seq(m.group()) + idx = m.end() + elif action == 'empty': + idx = data + elif action == 'outside': + inner_width, close_end = data + col += inner_width + idx = close_end + else: # 'visible' + open_seq, clipped_inner, close_seq, inner_width, hl_col_end, close_end = data + _append_seq(open_seq) inner_clipped_width = width( clipped_inner, control_codes=control_codes, tabsize=tabsize, ambiguous_width=ambiguous_width, @@ -444,144 +474,106 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: _write_cells(clipped_inner, inner_clipped_width, col, is_hyperlink=True) col += inner_clipped_width - # Emit hyperlink close as sequence after the cells - _append_seq(_make_hyperlink_close(hl_state.terminator), - at_col=col) - + _append_seq(close_seq, at_col=col) # Advance past the original hyperlink content col = hl_col_end idx = close_end - continue - - # 1a. 
HPA: horizontal position absolute (CSI n G) - if (hpa_n := m.group('hpa_n')) is not None: - col = int(hpa_n) - 1 if hpa_n else 0 - idx = m.end() - continue - - # 1b. Cursor forward, - if (cforward_n := m.group('cforward_n')) is not None: - n_forward = int(cforward_n) if cforward_n else 1 - move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) - col = move_end - idx = m.end() - continue - - # 1c. Cursor backward, - if (cbackward_n := m.group('cbackward_n')) is not None: - n_backward = int(cbackward_n) if cbackward_n else 1 - col = max(0, col - n_backward) - idx = m.end() - continue + continue - # 1d. Any other recognized zero-width sequence - _append_seq(seq) + # 1a. HPA: horizontal position absolute (CSI n G) + if (hpa_n := m.group('hpa_n')) is not None: + col = int(hpa_n) - 1 if hpa_n else 0 idx = m.end() continue - # 2. Carriage return and backspace (before TAB/grapheme fallthrough) - if char == '\r': - # CR: reset column to 0 - col = 0 - idx += 1 + # 1b. Cursor forward + if (cforward_n := m.group('cforward_n')) is not None: + n_forward = int(cforward_n) if cforward_n else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) + col = move_end + idx = m.end() continue - if char == '\x08': - # BS: decrement column - if col > 0: - col -= 1 - idx += 1 + # 1c. Cursor backward + if (cbackward_n := m.group('cbackward_n')) is not None: + n_backward = int(cbackward_n) if cbackward_n else 1 + if strict and n_backward > col: + raise ValueError( + f"Cursor left movement at position {idx} would move " + f"{n_backward} cells left from column {col}, " + f"exceeding string start" + ) + col = max(0, col - n_backward) + idx = m.end() continue - # 3. 
TAB expansion - if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - _write_cells(' ', 1, col) - col += 1 - else: - # preserve tab as-is - _append_seq(char) - idx += 1 - continue + # 1d. Any other recognized zero-width sequence + _append_seq(m.group()) + idx = m.end() + continue - # 4. Grapheme clustering for everything else - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + # 2. Carriage return and backspace (before TAB/grapheme fallthrough) + if char == '\r': + col = 0 + idx += 1 + continue - if grapheme_w == 0: - # combining/zero-width grapheme; preserve as token at this column - if start <= col < end: - _append_seq(grapheme) - elif col >= start and col + grapheme_w <= end: - # Fully visible - _write_cells(grapheme, grapheme_w, col) - elif col < end and col + grapheme_w > start: - # Partially visible (wide char at boundary) — emit fillchars - clip_start = max(start, col) - for i in range(min(end, col + grapheme_w) - clip_start): - _write_cells(fillchar, 1, clip_start + i) - # advance column whether visible or not - col += grapheme_w - idx += len(grapheme) + if char == '\x08': + if col > 0: + col -= 1 + idx += 1 + continue - # Reconstruct result from "painter's algorithm", this allows us to - # accurately depict clipping with horizontal movement - seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for col_pos, order, seq_text in sequences: - seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) - for entries in seqs_by_col.values(): - entries.sort() - - max_cell_col = max(cells.keys()) if cells else -1 - max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 - max_col = max(max_cell_col, max_seq_col) - - # Walk columns 0..min(max_col, end), emitting sequences then any cell - # or fillchar occupying each position. Visits *inclusive* of - # min(max_col, end) so sequences at `end` are preserved. 
- parts: list[str] = [] - walk_col = 0 - col_limit = min(max_col, end) - while walk_col <= col_limit: - # Zero-width sequences at this column - for _, seq_text in seqs_by_col.get(walk_col, ()): - parts.append(seq_text) + # 3. TAB expansion + if char == '\t': + col = _emit_tab_painter(col, _write_cells, _append_seq) + idx += 1 + continue - if walk_col >= end: - walk_col += 1 - continue + # 4. Grapheme clustering + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + _handle_grapheme_painter(grapheme, grapheme_w, col, _write_cells, _append_seq) + col += grapheme_w + idx += len(grapheme) + + # Reconstruct result from "painter's algorithm" + seqs_by_col: dict[int, list[tuple[int, str]]] = {} + for col_pos, order, seq_text in sequences: + seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) + for entries in seqs_by_col.values(): + entries.sort() + + max_cell_col = max(cells.keys()) if cells else -1 + max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 + max_col = max(max_cell_col, max_seq_col) + + parts: list[str] = [] + walk_col = 0 + col_limit = min(max_col, end) + while walk_col <= col_limit: + for _, seq_text in seqs_by_col.get(walk_col, ()): + parts.append(seq_text) + + if walk_col >= end: + walk_col += 1 + continue + + if walk_col in cells: + cell_text, cell_w = cells[walk_col] + parts.append(cell_text) + walk_col += cell_w + else: + if start <= walk_col <= max_cell_col: + parts.append(fillchar) + walk_col += 1 + + for c in sorted(seqs_by_col.keys()): + if c > col_limit: + for _, seq_text in seqs_by_col[c]: + parts.append(seq_text) - if walk_col in cells: - cell_text, cell_w = cells[walk_col] - # All cells satisfy walk_col >= start and walk_col + cell_w <= end - parts.append(cell_text) - walk_col += cell_w - else: - # Hole: emit fillchar for columns inside (start, end) that lie - # within the written cell area - if start <= walk_col <= max_cell_col: - parts.append(fillchar) - walk_col 
+= 1 - - # Trailing sequences past col_limit (SGR resets after short text, etc.) - for c in sorted(seqs_by_col.keys()): - if c > col_limit: - for _, seq_text in seqs_by_col[c]: - parts.append(seq_text) - - result = ''.join(parts) - - # Apply SGR prefix/suffix - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): - result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): - result += '\x1b[0m' - - return result + return _apply_sgr_wrap(''.join(parts)) \ No newline at end of file diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 423a6af..6eb7edd 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -74,7 +74,7 @@ def wcswidth( if ucs == 0x200D: if last_was_virama: # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next + # does not change cell count -- consume ZWJ only, let the next # consonant be handled by the virama conjunct rule. idx += 1 elif idx + 1 < end: diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 4ad6531..5b81400 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -87,6 +87,11 @@ def width( .. versionadded:: 0.3.0 + .. versionchanged:: 0.7.0 + Expanded strict-mode to raise :exc:`ValueError` when cursor-left movement + (CSI D) would move beyond the beginning of the string. Previously, cursor-left + was silently clamped to column 0 in all modes. 
+ Examples:: >>> width('hello') @@ -173,7 +178,14 @@ def width( elif (cforward_n := m.group('cforward_n')) is not None: current_col += int(cforward_n) if cforward_n else 1 elif (cbackward_n := m.group('cbackward_n')) is not None: - current_col = max(0, current_col - (int(cbackward_n) if cbackward_n else 1)) + n_backward = int(cbackward_n) if cbackward_n else 1 + if strict and n_backward > current_col: + raise ValueError( + f"Cursor left movement at position {idx} would move " + f"{n_backward} cells left from column {current_col}, " + f"exceeding string start" + ) + current_col = max(0, current_col - n_backward) # 2d. SGR and other zero-width sequences -- no column advance idx = m.end() max_extent = max(max_extent, current_col) @@ -214,7 +226,7 @@ def width( if char == '\u200D': if last_was_virama: # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next + # does not change cell count -- consume ZWJ only, let the next # consonant be handled by the virama conjunct rule. idx += 1 elif idx + 1 < text_len: diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 655910a..7a40d06 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -364,7 +364,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines.append(indent + line_content) is_first_line = False else: - # max_lines reached with remaining content — + # max_lines reached with remaining content -- # pop chunks until placeholder fits, then break. 
placeholder_w = self._width(self.placeholder) while current_line: From dbb742fdaef5c8843ad9da0c20f02116bb070d96 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 17:04:26 -0400 Subject: [PATCH 51/70] link to more wcwidth's --- docs/intro.rst | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 3c91c10..39adb0a 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -496,6 +496,10 @@ Other Languages There are similar implementations of the `wcwidth()`_ and `wcswidth()`_ functions in other languages. +- `contour-terminal/libunicode`_: C++20 +- `ridiculousfish/widecharwidth`_: Python +- `termux/wcwidth`_: C +- `powerman/wcwidth-icons`_: C - `timoxley/wcwidth`_: JavaScript - `janlelis/unicode-display_width`_: Ruby - `alecrabbit/php-wcwidth`_: PHP @@ -505,6 +509,9 @@ languages. - `grepsuzette/wcwidth`_: Haxe - `aperezdc/lua-wcwidth`_: Lua - `joachimschmidt557/zig-wcwidth`_: Zig +- `mycoboco/wcwidth.js`_: JavaScript +- `ainame/swift-displaywidth`_: Swift +- `pmonks/clj-wcwidth`_: Clojure - `fumiyas/wcwidth-cjk`_: `LD_PRELOAD` override - `joshuarubin/wcwidth9`_: Unicode version 9 in C - `spectreconsole/wcwidth`_: C# @@ -514,15 +521,12 @@ History ======= 0.7.0 *2026-04-30* - * **New** ``control_codes`` parameter for `clip()`_, supporting ``'parse'`` (default), - ``'strict'``, and ``'ignore'`` modes for control character and cursor movement handling. - * **Improved** `clip()`_ with OSC 8 hyperlink-aware clipping: visible text inside hyperlinks - is clipped to the requested column range, and the hyperlink is rebuilt around the clipped text. + * **New** `clip()`_ parameter ``control_codes='parse'``, ``'ignore'``, and ``'strict'``. `clip()`_ + is now able to clip OSC 8 hyperlinks. * **Improved** `clip()`_ and `width()`_ to support horizontal cursor sequences (``cub``, ``cuf``, ``hpa``). Cursor-left (``cub``) can now overwrite previous text, matching terminal behavior. 
``column_address`` (``hpa``) and carriage return (``\r``) are now parsed, or raise - ``ValueError`` on ``strict``. Cursor-left movement beyond string start raises ``ValueError`` - in strict mode. + ``column_address`` (``hpa``) and carriage return (``\r``) are now parsed, and some values + conditionally raise ``ValueError`` when ``control_codes='strict'``. 0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, @@ -774,6 +778,13 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`fumiyas/wcwidth-cjk`: https://github.com/fumiyas/wcwidth-cjk .. _`joshuarubin/wcwidth9`: https://github.com/joshuarubin/wcwidth9 .. _`spectreconsole/wcwidth`: https://github.com/spectreconsole/wcwidth +.. _`contour-terminal/libunicode`: https://github.com/contour-terminal/libunicode +.. _`ridiculousfish/widecharwidth`: https://github.com/ridiculousfish/widecharwidth +.. _`termux/wcwidth`: https://github.com/termux/wcwidth +.. _`powerman/wcwidth-icons`: https://github.com/powerman/wcwidth-icons +.. _`mycoboco/wcwidth.js`: https://github.com/mycoboco/wcwidth.js +.. _`ainame/swift-displaywidth`: https://github.com/ainame/swift-displaywidth +.. _`pmonks/clj-wcwidth`: https://github.com/pmonks/clj-wcwidth .. _`python-cmd2/cmd2`: https://github.com/python-cmd2/cmd2 .. _`stratis-storage/stratis-cli`: https://github.com/stratis-storage/stratis-cli .. _`ihabunek/toot`: https://github.com/ihabunek/toot From 535959b971db4b7b24405d934abd941818bd1171 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Thu, 30 Apr 2026 20:19:55 -0400 Subject: [PATCH 52/70] experiment with simpler VS16 and GraphemeMeasurer() class is it too much overhead? we'll let the benchmarks decide ..
--- docs/specs.rst | 5 +- tests/test_width.py | 32 +++--- wcwidth/_clip.py | 61 ++++------- wcwidth/_constants.py | 2 +- wcwidth/_wcswidth.py | 201 +++++++++++++++++++++--------------- wcwidth/_width.py | 104 +++---------------- wcwidth/escape_sequences.py | 28 +++++ wcwidth/textwrap.py | 37 +------ 8 files changed, 201 insertions(+), 269 deletions(-) diff --git a/docs/specs.rst b/docs/specs.rst index 852f081..098699f 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -99,8 +99,9 @@ an emoji base, they combine with the base and add 0 to total width. Any characters of `Modifier Symbol`_ category, ``'Sk'`` where ``'FULLWIDTH'`` is present in comment of `UnicodeData.txt`_, aprox. 3 characters. -Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by -`emoji-variation-sequences.txt`_ as ``emoji style``. +Any character with `U+FE0F`_ (Variation Selector 16) defined as ``emoji style`` +in `emoji-variation-sequences.txt`_: VS16 adds 1 cell to the narrow character +it directly follows, making the pair width 2. Wide characters are unchanged. Any character of non-zero width followed by an ``Mc`` (`Spacing Combining Mark`_) character when measured in sequence by :func:`wcwidth.wcswidth` or diff --git a/tests/test_width.py b/tests/test_width.py index d94bc7a..e0abe70 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -199,29 +199,26 @@ def test_vs16_selector(): def test_zwj_with_non_emoji_chars(): - """ZWJ with non-emoji characters and trailing VS16.""" - # ZWJ (Zero Width Joiner) skips both itself and the following character, treating them as a - # failed emoji ZWJ sequence. When followed by VS16, the VS16 should NOT apply to the earlier - # emoji because VS16 must immediately follow the character it modifies. - # - # In the full parse loop, VS16 checks `last_measured_idx == idx - 1` (immediate adjacency). - # The ZWJ+char skip means VS16 is not adjacent to the smiley, so VS16 has no effect. 
- # + """ZWJ with non-emoji characters and trailing VS16. + + These are invalid Unicode sequences (ZWJ followed by non-emoji), so + behavior is implementation-defined. The emoji base (smiley, width 1) + is narrow, and VS16 looks back to it across the ZWJ-consumed characters, + adding 1 cell for a total width of 2. + """ # Control test, assert wcwidth.width("\u263A\uFE0F") == 2 # smiley + VS16 = 2 - # ZWJ followed by non-emoji, VS16 does not apply (not adjacent) - assert wcwidth.width("\u263A\u200Da\uFE0F") == 1 - assert wcwidth.width("\u263A\u200Dx\uFE0F") == 1 - assert wcwidth.width("\u263A\u200Da\u200Db\uFE0F") == 1 + # ZWJ followed by non-emoji: VS16 applies to the smiley base + assert wcwidth.width("\u263A\u200Da\uFE0F") == 2 + assert wcwidth.width("\u263A\u200Dx\uFE0F") == 2 + assert wcwidth.width("\u263A\u200Da\u200Db\uFE0F") == 2 # ZWJ at end of string assert wcwidth.width("\u263A\u200D") == 1 # smiley + ZWJ = 1 # Long strings (>20 chars) use fast path which routes to wcswidth(). - # wcswidth() has more lenient VS16 handling, causing VS16 to incorrectly apply (!) - # Multiply by 10 to exceed threshold: "\u263A\u200Da\uFE0F" (4 chars) * 10 = 40 chars - assert wcwidth.width("\u263A\u200Da\uFE0F" * 10) == 20 # (smiley(1) + ZWJ+a(0) + VS16(+1)) * 10 (!) + assert wcwidth.width("\u263A\u200Da\uFE0F" * 10) == 20 def test_vs16_after_control_chars(): @@ -237,10 +234,9 @@ def test_vs16_after_control_chars(): assert wcwidth.width("\u263A\x0d\uFE0F") == 1 # smiley(1) + CR(reset) + VS16(0), extent=1 # Long strings (>20 chars) use fast path which routes to wcswidth(). - # wcswidth() has more lenient VS16 handling (`last_measured_idx >= 0` vs `== idx - 1`), - # causing VS16 to incorrectly apply when separated by control chars (!) + # In ignore mode, BEL is stripped, so VS16 is adjacent to the smiley and applies correctly. # Multiply by 10 to exceed threshold - assert wcwidth.width(("\u263A\x07\uFE0F") * 10) == 20 # (smiley(1) + BEL(0) + VS16(+1)) * 10 (!) 
+ assert wcwidth.width(("\u263A\x07\uFE0F") * 10) == 20 # (smiley(1) + BEL-stripped(0) + VS16(+1)) * 10 def test_width_long_horizontal_fastpath(): diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 00fa177..b0af500 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -1,10 +1,7 @@ """This is a python implementation of clip().""" from __future__ import annotations -# std imports -import re - -from typing import Literal, Optional, NamedTuple +from typing import Literal, Optional # local from ._width import width @@ -13,33 +10,10 @@ _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) -from .escape_sequences import _SEQUENCE_CLASSIFY, _HORIZONTAL_CURSOR_MOVEMENT - -# OSC 8 hyperlink parsing (mirrors textwrap.py to avoid circular import) -_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') -_HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(?:\x07|\x1b\\)') - - -class _HyperlinkState(NamedTuple): - """Open OSC 8 hyperlink: url, params, terminator (BEL or ST).""" - - url: str - params: str - terminator: str - - -def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: - if (m := _HYPERLINK_OPEN_RE.match(seq)): - return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) - return None - - -def _make_hyperlink_open(state: _HyperlinkState) -> str: - return f'\x1b]8;{state.params};{state.url}{state.terminator}' - - -def _make_hyperlink_close(terminator: str) -> str: - return f'\x1b]8;;{terminator}' +from .escape_sequences import (_SEQUENCE_CLASSIFY, _HORIZONTAL_CURSOR_MOVEMENT, + _HYPERLINK_OPEN_RE, _HYPERLINK_CLOSE_RE, + _HyperlinkState, _parse_hyperlink_open, + _make_hyperlink_open, _make_hyperlink_close) def _find_hyperlink_close(text: str, open_end: int) -> Optional[tuple[int, int]]: @@ -212,7 +186,7 @@ def _process_hyperlink( action='empty' -> data is close_end (skip entirely) action='outside' -> data is (inner_width, close_end) (advance col, skip) action='visible' -> data is (open_seq, 
clipped_inner, close_seq, - inner_width, hl_col_end, close_end) + inner_width, clipped_width, hl_col_end, close_end) """ close_span = _find_hyperlink_close(text, match_end) if close_span is None: @@ -245,11 +219,18 @@ def _process_hyperlink( control_codes=control_codes, ) + # Compute clipped width once here; avoids a second width() call in the painter path. + clipped_width = width( + clipped_inner, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + return ('visible', ( - _make_hyperlink_open(hl_state), + _make_hyperlink_open(hl_state.url, hl_state.params, hl_state.terminator), clipped_inner, _make_hyperlink_close(hl_state.terminator), inner_width, + clipped_width, hl_col_end, close_end, )) @@ -358,7 +339,7 @@ def _apply_sgr_wrap(result: str) -> str: col += inner_width idx = close_end else: # 'visible' - open_seq, clipped_inner, close_seq, inner_width, _, close_end = data + open_seq, clipped_inner, close_seq, inner_width, clipped_width, _, close_end = data output.append(open_seq) output.append(clipped_inner) output.append(close_seq) @@ -465,15 +446,11 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: col += inner_width idx = close_end else: # 'visible' - open_seq, clipped_inner, close_seq, inner_width, hl_col_end, close_end = data + open_seq, clipped_inner, close_seq, inner_width, clipped_width, hl_col_end, close_end = data _append_seq(open_seq) - inner_clipped_width = width( - clipped_inner, control_codes=control_codes, - tabsize=tabsize, ambiguous_width=ambiguous_width, - ) - _write_cells(clipped_inner, inner_clipped_width, col, + _write_cells(clipped_inner, clipped_width, col, is_hyperlink=True) - col += inner_clipped_width + col += clipped_width _append_seq(close_seq, at_col=col) # Advance past the original hyperlink content col = hl_col_end @@ -576,4 +553,4 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: for _, seq_text in seqs_by_col[c]: parts.append(seq_text) - return 
_apply_sgr_wrap(''.join(parts)) \ No newline at end of file + return _apply_sgr_wrap(''.join(parts)) diff --git a/wcwidth/_constants.py b/wcwidth/_constants.py index 5505ef5..7c2b627 100644 --- a/wcwidth/_constants.py +++ b/wcwidth/_constants.py @@ -62,4 +62,4 @@ _ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION] _WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION] -_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))] +_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[_LATEST_VERSION] diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 6eb7edd..4625b12 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -14,6 +14,118 @@ from .table_grapheme import ISC_CONSONANT +class GraphemeMeasurer: + """Stateful measurer for grapheme-aware character width. + + Encapsulates the lookbehind state that must be threaded through + sequential per-character measurements by :meth:`measure_at`. + + Callers that interleave escape sequences or control codes between + characters should call :meth:`reset_adjacency` to prevent VS16 + from applying across the gap. + """ + + def __init__(self, text: str, end: int, wcwidth_fn) -> None: + self._text = text + self._end = end + self._wcwidth_fn = wcwidth_fn + self._last_measured_idx = -2 + self._last_measured_ucs = -1 + self._last_was_virama = False + self.conjunct_pending = False + + def measure_at(self, idx: int) -> tuple[int, int]: + """Process character at ``text[idx]`` and return ``(next_idx, width)``. + + Handles ZWJ, VS16, Regional Indicators, Fitzpatrick modifiers, virama + conjunct formation, Mc spacing marks, and standard ``wcwidth`` measurement. + + ``width`` is ``-1`` for C0/C1 control characters (caller must handle). + Callers that never pass C0/C1 characters will always receive ``width >= 0``. 
+ """ + char = self._text[idx] + ucs = ord(char) + + # ZWJ (U+200D) + if ucs == 0x200D: + if self._last_was_virama: + return (idx + 1, 0) + if idx + 1 < self._end: + # Emoji ZWJ: skip next character unconditionally. + # Preserve _last_measured_idx so VS16 checks the emoji base + # (narrow bases get +1, wide bases are already 2 cells). + self._last_was_virama = False + return (idx + 2, 0) + self._last_was_virama = False + return (idx + 1, 0) + + # VS16 (U+FE0F): converts preceding narrow character to wide. + if ucs == 0xFE0F and self._last_measured_idx >= 0: + vs_width = bisearch( + ord(self._text[self._last_measured_idx]), + VS16_NARROW_TO_WIDE['9.0.0'], + ) + # Prevent double application; preserve emoji context (_last_measured_ucs stays) + self._last_measured_idx = -2 + return (idx + 1, vs_width) + + # Regional Indicator & Fitzpatrick (both above BMP) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + # Lazy RI pairing: count preceding consecutive RIs + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(self._text[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + # Second RI in pair: zero width (pair = one 2-cell flag) + self._last_measured_ucs = ucs + return (idx + 1, 0) + # Fitzpatrick modifier: zero-width when following emoji base + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and self._last_measured_ucs in _EMOJI_ZWJ_SET): + return (idx + 1, 0) + + # Virama conjunct formation + if self._last_was_virama and bisearch(ucs, ISC_CONSONANT): + self._last_measured_idx = idx + self._last_measured_ucs = ucs + self._last_was_virama = False + self.conjunct_pending = True + return (idx + 1, 0) + + # Normal character: measure with wcwidth + w = self._wcwidth_fn(char) + if w < 0: + # C0/C1 control character — caller must handle + return (idx + 1, -1) + if w > 0: + extra = 1 if self.conjunct_pending else 0 + self._last_measured_idx = idx + self._last_measured_ucs = ucs + self._last_was_virama = False + 
self.conjunct_pending = False + return (idx + 1, w + extra) + if self._last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + self._last_measured_idx = -2 + self._last_was_virama = False + self.conjunct_pending = False + return (idx + 1, 1) + self._last_was_virama = ucs in _ISC_VIRAMA_SET + return (idx + 1, 0) + + def reset_adjacency(self) -> None: + """Break VS16/Fitzpatrick adjacency. + + Call after processing escape sequences or control codes to prevent + VS16 and Fitzpatrick lookbehind from applying across the gap. + """ + self._last_measured_idx = -2 + self._last_measured_ucs = -1 + + def wcswidth( pwcs: str, n: typing.Union[int, None] = None, @@ -64,89 +176,12 @@ def wcswidth( end = len(pwcs) if n is None else n total_width = 0 idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) + measurer = GraphemeMeasurer(pwcs, end, _wcwidth) while idx < end: - char = pwcs[idx] - ucs = ord(char) - if ucs == 0x200D: - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count -- consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < end: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - if ucs == 0xFE0F and last_measured_idx >= 0: - # VS16 following a measured character: add 1 if that character is - # known to be converted from narrow to wide by VS16. 
- total_width += bisearch(ord(pwcs[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]) - last_measured_idx = -2 # Prevent double application - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs only when the last one is - # received, because RI's are received so rarely its better than per-loop tracking of - # 'last char was an RI'. - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd - # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag - # and wide 'U'. - idx += 1 - last_measured_ucs = ucs - continue - # First or unpaired RI: measured normally (width 2 from table) - # Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - # Virama conjunct formation: consonant following virama contributes 0 width. 
- # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and bisearch(ucs, ISC_CONSONANT): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - wcw = _wcwidth(char) - if wcw < 0: - # early return -1 on C0 and C1 control characters - return wcw - if wcw > 0: - if conjunct_pending: - total_width += 1 - conjunct_pending = False - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - wcw = 1 - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - total_width += wcw - idx += 1 - if conjunct_pending: + idx, w = measurer.measure_at(idx) + if w < 0: + return -1 + total_width += w + if measurer.conjunct_pending: total_width += 1 return total_width diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 5b81400..478832a 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -4,16 +4,8 @@ # local from ._wcwidth import wcwidth -from .bisearch import bisearch -from ._wcswidth import wcswidth -from ._constants import (_EMOJI_ZWJ_SET, - _ISC_VIRAMA_SET, - _CATEGORY_MC_TABLE, - _FITZPATRICK_RANGE, - _REGIONAL_INDICATOR_SET) -from .table_vs16 import VS16_NARROW_TO_WIDE +from ._wcswidth import wcswidth, GraphemeMeasurer from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL -from .table_grapheme import ISC_CONSONANT from .escape_sequences import (_SEQUENCE_CLASSIFY, CURSOR_MOVEMENT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -139,16 +131,13 @@ def width( current_col = 0 max_extent = 0 idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16; -2 can never match idx-1 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama 
conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) text_len = len(text) # Select wcwidth call pattern for best lru_cache performance: # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) + measurer = GraphemeMeasurer(text, text_len, _wcwidth) while idx < text_len: char = text[idx] @@ -188,6 +177,8 @@ def width( current_col = max(0, current_col - n_backward) # 2d. SGR and other zero-width sequences -- no column advance idx = m.end() + # Escape sequences break VS16 adjacency: reset last-measured state + measurer.reset_adjacency() max_extent = max(max_extent, current_col) continue @@ -196,12 +187,14 @@ def width( if strict: raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") idx += 1 + measurer.reset_adjacency() continue if char in VERTICAL_CTRL: if strict: raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") idx += 1 + measurer.reset_adjacency() continue # 3. Horizontal movement characters @@ -220,93 +213,22 @@ def width( current_col = 0 max_extent = max(max_extent, current_col) idx += 1 + measurer.reset_adjacency() continue - # 4. Zero-Width Joiner (ZWJ) - if char == '\u200D': - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count -- consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < text_len: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - - # 5. Other zero-width characters (control chars) + # 4. 
Zero-width control characters if char in ZERO_WIDTH_CTRL: idx += 1 + measurer.reset_adjacency() continue - ucs = ord(char) - - # 6. VS16: converts preceding narrow character to wide - if ucs == 0xFE0F: - if last_measured_idx == idx - 1: - if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]): - current_col += 1 - max_extent = max(max_extent, current_col) - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - - # 6b. Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - last_measured_ucs = ucs - idx += 1 - continue - # 6c. Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - - # 7. Virama conjunct formation: consonant following virama contributes 0 width. - # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and bisearch(ucs, ISC_CONSONANT): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - - # 8. Normal characters: measure with wcwidth - w = _wcwidth(char) + # 5. 
ZWJ, VS16, Regional Indicators, Fitzpatrick, Virama conjuncts, Mc, wcwidth + idx, w = measurer.measure_at(idx) if w > 0: - if conjunct_pending: - current_col += 1 - conjunct_pending = False current_col += w max_extent = max(max_extent, current_col) - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - current_col += 1 - max_extent = max(max_extent, current_col) - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - idx += 1 - - if conjunct_pending: + + if measurer.conjunct_pending: current_col += 1 max_extent = max(max_extent, current_col) return max_extent diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index afa8c43..b6a6680 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -10,10 +10,38 @@ import re import typing +from typing import Optional, NamedTuple # local from .sgr_state import _SGR_PATTERN +_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') +_HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(?:\x07|\x1b\\)') + +class _HyperlinkState(NamedTuple): + """Open OSC 8 hyperlink: url, params, terminator (BEL or ST).""" + + url: str + params: str + terminator: str + + +def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: + """Parse OSC 8 open sequence; return state or None.""" + if (m := _HYPERLINK_OPEN_RE.match(seq)): + return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) + return None + + +def _make_hyperlink_open(url: str, params: str, terminator: str) -> str: + """Generate OSC 8 open sequence.""" + return f'\x1b]8;{params};{url}{terminator}' + + +def _make_hyperlink_close(terminator: str) -> str: + """Generate OSC 8 close sequence.""" + return f'\x1b]8;;{terminator}' + # Zero-width escape sequences (SGR, OSC, CSI, 
etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 7a40d06..850e4c2 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -8,51 +8,24 @@ from __future__ import annotations # std imports -import re import secrets import textwrap -from typing import TYPE_CHECKING, Optional, NamedTuple +from typing import TYPE_CHECKING, Optional # local from ._width import width as wcwidth_width from .grapheme import iter_graphemes from .sgr_state import propagate_sgr as _propagate_sgr -from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences +from .escape_sequences import (ZERO_WIDTH_PATTERN, iter_sequences, + _HYPERLINK_OPEN_RE, _HyperlinkState, + _parse_hyperlink_open, _make_hyperlink_open, + _make_hyperlink_close) if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal -class _HyperlinkState(NamedTuple): - """State for tracking an open OSC 8 hyperlink across line breaks.""" - - url: str # hyperlink target URL - params: str # id=xxx and other key=value pairs separated by : - terminator: str # BEL (\x07) or ST (\x1b\\) - - -# Hyperlink parsing: captures (params, url, terminator) -_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') - - -def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: - """Parse OSC 8 open sequence, return state or None.""" - if (m := _HYPERLINK_OPEN_RE.match(seq)): - return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) - return None - - -def _make_hyperlink_open(url: str, params: str, terminator: str) -> str: - """Generate OSC 8 open sequence.""" - return f'\x1b]8;{params};{url}{terminator}' - - -def _make_hyperlink_close(terminator: str) -> str: - """Generate OSC 8 close sequence.""" - return f'\x1b]8;;{terminator}' - - class SequenceTextWrapper(textwrap.TextWrapper): """ Sequence-aware text wrapper extending 
:class:`textwrap.TextWrapper`. From da498b72cfba7268aa1d66800ef5159e5647d232 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 11:58:49 -0400 Subject: [PATCH 53/70] big update, no whammies, let's check the performance .. --- docs/api.rst | 4 + docs/intro.rst | 6 +- docs/specs.rst | 8 +- tests/test_benchmarks.py | 42 ++ tests/test_clip.py | 31 +- tests/test_clip_cursors.py | 71 ++- tests/test_width.py | 10 +- wcwidth/__init__.py | 4 +- wcwidth/_clip.py | 870 ++++++++++++++++++++---------------- wcwidth/_wcswidth.py | 27 +- wcwidth/_width.py | 2 +- wcwidth/escape_sequences.py | 28 -- wcwidth/textwrap.py | 41 +- 13 files changed, 669 insertions(+), 475 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 55d288b..901b019 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,4 +36,8 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions +.. autofunction:: wcwidth.Hyperlink + +.. autofunction:: wcwidth.HyperlinkParams + .. _SEMVER: https://semver.org diff --git a/docs/intro.rst b/docs/intro.rst index 39adb0a..4deb5df 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -524,9 +524,9 @@ History * **New** `clip()`_ parameter ``control_codes='parse'``, ``'ignore'``, and ``'strict'``. `clip()`_ is now able to clip OSC 8 hyperlinks. * **Improved** `clip()`_ and `width()`_ to support horizontal cursor sequences (``cub``, ``cuf``, - ``hpa``). Cursor-left (``cub``) can now overwrite previous text, matching terminal behavior. - ``column_address`` (``hpa``) and carriage return (``\r``) are now parsed, and some values - conditionally raise ``ValueError`` when ``control_codes='parse'``. + ``hpa``). Cursor-left (``cub``) or backspace (``\b``) now overwrites text. ``column_address`` + (``hpa``) and carriage return (``\r``) are now parsed, and some values conditionally raise + ``ValueError`` when ``control_codes='parse'``. 
0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, diff --git a/docs/specs.rst b/docs/specs.rst index 098699f..1182b38 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -108,7 +108,7 @@ character when measured in sequence by :func:`wcwidth.wcswidth` or :func:`wcwidth.width`. The ``Mc`` character adds +1 to the total width, reflecting its *positive advance width* as defined in `General Category`_ (Table 4-4). Zero-width combining marks (``Mn``) between the base character -and the ``Mc`` do not break the association — for example, a consonant followed +and the ``Mc`` do not break the association. For example, a consonant followed by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1. Virama Conjunct Formation @@ -116,13 +116,13 @@ Virama Conjunct Formation In `Brahmic scripts`_, a `Virama`_ (``Indic_Syllabic_Category=Virama`` in `IndicSyllabicCategory.txt`_) between two consonants triggers `conjunct`_ -formation: the font engine merges the consonants into a single ligature glyph. +formation: the consonants are merged into a single ligature glyph. - A ``Consonant`` immediately following a ``Virama`` contributes 0 width. -- The conjunct still occupies cells — the next visible advance settles it: +- The conjunct still occupies cells and the next visible advance settles it: - A following ``Mc`` (`Spacing Combining Mark`_, e.g. a vowel sign) counts as - 1 cell and closes the conjunct — no extra cell is added. + 1 cell and closes the conjunct. - A following character with positive width (or end of string) adds 1 cell for the conjunct before counting its own width. 
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 7b8b2b9..76bd06c 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -293,6 +293,48 @@ def test_clip_complex_sgr(benchmark): benchmark(wcwidth.clip, text, 6, 11) +def test_clip_long_cjk_past_window(benchmark): + """Benchmark clip() with long CJK text, narrow window (early-exit path).""" + text = '中文测试字符串' * 100 # 700 chars, no escape sequences + benchmark(wcwidth.clip, text, 0, 50) + + +def test_clip_dense_ansi_past_window(benchmark): + """Benchmark clip() with dense ANSI sequences past clip window (SGR tracking).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30) + + +def test_clip_dense_ansi_no_propagate(benchmark): + """Benchmark clip() with dense ANSI sequences, SGR propagation disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False) + + +def test_clip_osc8_hyperlinks(benchmark): + """Benchmark clip() with OSC 8 hyperlinks (hyperlink parsing path).""" + text = '\x1b]8;;http://example.com\x07Click Here\x1b]8;;\x07 ' * 20 + benchmark(wcwidth.clip, text, 0, 80) + + +def test_clip_cursor_cr_overwrite(benchmark): + """Benchmark clip() with carriage-return overwrite (painter path).""" + text = 'hello\rworld ' * 20 + benchmark(wcwidth.clip, text, 0, 50) + + +def test_clip_cursor_csi_backward(benchmark): + """Benchmark clip() with CSI cursor-backward sequences (painter path).""" + text = 'hello\x1b[2Dxy ' * 20 + benchmark(wcwidth.clip, text, 0, 40) + + +def test_clip_long_ascii_fastpath(benchmark): + """Benchmark clip() with long ASCII string (fast-path slice).""" + text = 'hello world ' * 1000 + benchmark(wcwidth.clip, text, 500, 600) + + def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] diff --git 
a/tests/test_clip.py b/tests/test_clip.py index 9e51851..1a4d78f 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -191,12 +191,24 @@ def test_clip_sequences_osc_hyperlink(): f'{OSC_START_BEL}\x1b[31mlink\x1b[0m{OSC_END_BEL}'), # Hyperlink open without matching close -- preserved as regular sequence ('\x1b]8;;http://x.com\x07link', 0, 4, '\x1b]8;;http://x.com\x07link'), - # Nested hyperlinks - ('\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07', 0, 14, - '\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07'), # Bare ESC between hyperlink markers ('\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07', 0, 6, '\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07'), + # Per OSC 8 spec "A note on opening/closing hyperlinks": terminal + # emulators treat hyperlinks as a state attribute, not nested anchors. + # Opening a new hyperlink replaces the current one; a single close + # terminates the hyperlink regardless of how many opens preceded it. + # + # Two opens, one close: URL "b" replaces "a", close terminates. + ('\x1b]8;;a\x07AB\x1b]8;;b\x07CD\x1b]8;;\x07EF', 0, 6, + '\x1b]8;;a\x07AB\x1b]8;;b\x07CD\x1b]8;;\x07EF'), + # URL switch without closing: "b" replaces "a", no close in input. + ('\x1b]8;;a\x07AB\x1b]8;;b\x07CD', 0, 4, + '\x1b]8;;a\x07AB\x1b]8;;b\x07CD'), + # Multiple opens, close, bare close: "b" replaces "a", first close + # terminates, trailing close is harmless (closing when not open). + ('\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07', 0, 10, + '\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07'), ] @@ -221,9 +233,16 @@ def test_clip_hyperlink_control_codes_overwrite(control_codes, start, end, expec def test_clip_osc_hyperlink_strict_raises(): - """control_codes='strict' raises ValueError when overwriting hyperlink cells.""" - with pytest.raises(ValueError, match='OSC 8 hyperlink'): - clip(_HLINK_OVERWRITE, 0, 4, control_codes='strict') + """ + control_codes='strict' allows hyperlink-cursor interactions. 
+ + Overwriting hyperlink cells causes corrupted "run on" hyperlinks in practical + testing with kitty, presumably the hidden "end hyperlink" is not found, in + any case, we make no attempt to parse overwrite of hyperlinks + """ + assert repr(clip(_HLINK_OVERWRITE, 0, 4, control_codes='strict')) == repr( + f'{OSC_START_BEL}link{OSC_END_BEL}' + ) # Painter-path hyperlink edge cases diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index 03ad940..d53bb34 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -105,18 +105,83 @@ def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expe def test_clip_cursor_left_strict_out_of_bounds(): - """clip() with control_codes='strict' raises on cursor-left beyond string start.""" + """Clip() with control_codes='strict' raises on cursor-left beyond string start.""" with pytest.raises(ValueError, match='Cursor left movement'): clip('a\x1b[5Da', 0, 1, control_codes='strict') def test_clip_cursor_left_strict_out_of_bounds_painter(): - """clip() strict-mode raises on cursor-left beyond start in painter path.""" + """Clip() strict-mode raises on cursor-left beyond start in painter path.""" with pytest.raises(ValueError, match='Cursor left movement'): clip('\x1b[2Dab', 0, 2, control_codes='strict') def test_clip_cursor_left_out_of_bounds_parse_no_raise(): - """clip() parse mode silently clamps cursor-left beyond start.""" + """Clip() parse mode silently clamps cursor-left beyond start.""" assert clip('a\x1b[5Da', 0, 1) == 'a' assert clip('ab\x1b[99Dcd', 0, 4) == 'cd' + + +# Indeterminate-effect sequences that raise ValueError in strict mode +# (matching width() behavior). 
+ +INDETERMINATE_SEQUENCES = [ + ('\x1b[K', 'erase_in_line'), + ('\x1b[2K', 'erase_in_line_params'), + ('\x1b[J', 'erase_in_display'), + ('\x1b[2J', 'erase_in_display_params'), + ('\x1b[H', 'cursor_home'), + ('\x1b[1;1H', 'cursor_address'), + ('\x1b[A', 'cursor_up'), + ('\x1b[2A', 'cursor_up_params'), + ('\x1b[B', 'cursor_down'), + ('\x1b[5B', 'cursor_down_params'), + ('\x1b[P', 'delete_character'), + ('\x1b[1P', 'parm_dch'), + ('\x1b[M', 'delete_line'), + ('\x1b[1M', 'parm_delete_line'), + ('\x1b[L', 'insert_line'), + ('\x1b[1L', 'parm_insert_line'), + ('\x1b[@', 'insert_character'), + ('\x1b[1X', 'erase_chars'), + ('\x1b[S', 'scroll_up'), + ('\x1b[T', 'scroll_down'), + ('\x1b[?1049h', 'enter_fullscreen'), + ('\x1b[?1049l', 'exit_fullscreen'), + ('\x1bD', 'scroll_forward'), + ('\x1bM', 'scroll_reverse'), + ('\x1b8', 'restore_cursor'), + ('\x1bc', 'full_reset'), +] + + +@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) +def test_clip_strict_indeterminate_raises(seq, cap_name): + """Clip() strict mode raises ValueError on indeterminate-effect sequences.""" + with pytest.raises(ValueError, match='Indeterminate cursor sequence'): + clip(f'hello{seq}world', 0, 10, control_codes='strict') + + +@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) +def test_clip_parse_indeterminate_preserved(seq, cap_name): + """Clip() parse mode preserves indeterminate sequences as zero-width.""" + result = clip(f'hello{seq}world', 0, 10, control_codes='parse') + # The sequence is preserved, visible text is hello + world = 10 chars + assert 'hello' in result + assert 'world' in result + assert seq in result + + +def test_clip_strict_cr_allowed(): + """Carriage return is allowed in strict mode (text begins at column 0).""" + assert clip('hello\rworld', 0, 5, control_codes='strict') == 'world' + + +def test_clip_strict_hpa_allowed(): + """HPA is allowed in strict mode (text begins at column 0).""" + assert clip('abc\x1b[5Gde', 0, 10, control_codes='strict') == 
'abc de' + + +def test_clip_strict_cursor_left_allowed(): + """Cursor-left within bounds is allowed in strict mode.""" + assert clip('hello\x1b[2Dxy', 0, 5, control_codes='strict') == 'helxy' diff --git a/tests/test_width.py b/tests/test_width.py index e0abe70..8e43b47 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -199,12 +199,12 @@ def test_vs16_selector(): def test_zwj_with_non_emoji_chars(): - """ZWJ with non-emoji characters and trailing VS16. + """ + ZWJ with non-emoji characters and trailing VS16. - These are invalid Unicode sequences (ZWJ followed by non-emoji), so - behavior is implementation-defined. The emoji base (smiley, width 1) - is narrow, and VS16 looks back to it across the ZWJ-consumed characters, - adding 1 cell for a total width of 2. + These are invalid Unicode sequences (ZWJ followed by non-emoji), so behavior is implementation- + defined. The emoji base (smiley, width 1) is narrow, and VS16 looks back to it across the ZWJ- + consumed characters, adding 1 cell for a total width of 2. 
""" # Control test, assert wcwidth.width("\u263A\uFE0F") == 2 # smiley + VS16 = 2 diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index d38e383..871b23e 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -17,6 +17,7 @@ from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap from ._wcswidth import wcswidth +from .hyperlink import Hyperlink, HyperlinkParams from .sgr_state import propagate_sgr from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN @@ -30,7 +31,8 @@ __all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr') + 'list_versions', 'propagate_sgr', + 'Hyperlink', 'HyperlinkParams') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index b0af500..6755d1d 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -1,375 +1,330 @@ """This is a python implementation of clip().""" from __future__ import annotations -from typing import Literal, Optional +# std imports +import enum + +from typing import Literal, Optional, NamedTuple # local from ._width import width from .grapheme import iter_graphemes +from .hyperlink import Hyperlink, HyperlinkParams from .sgr_state import (_SGR_STATE_DEFAULT, _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) -from .escape_sequences import (_SEQUENCE_CLASSIFY, _HORIZONTAL_CURSOR_MOVEMENT, - _HYPERLINK_OPEN_RE, _HYPERLINK_CLOSE_RE, - _HyperlinkState, _parse_hyperlink_open, - _make_hyperlink_open, _make_hyperlink_close) +from .escape_sequences import (_SEQUENCE_CLASSIFY, + _HORIZONTAL_CURSOR_MOVEMENT, + 
INDETERMINATE_EFFECT_SEQUENCE) + + +class _ClipContext(NamedTuple): + """Immutable parameters for a clip operation.""" + text: str + start: int + end: int + fillchar: str + tabsize: int + ambiguous_width: int + control_codes: Literal['parse', 'strict', 'ignore'] + strict: bool + propagate_sgr: bool + + +class _HyperlinkAction(enum.Enum): + """Outcome of processing an OSC 8 hyperlink unit.""" + NO_CLOSE = enum.auto() # open sequence without matching close + EMPTY = enum.auto() # hyperlink with no visible inner text + OUTSIDE = enum.auto() # hyperlink entirely outside the clip window + VISIBLE = enum.auto() # hyperlink overlaps the clip window -def _find_hyperlink_close(text: str, open_end: int) -> Optional[tuple[int, int]]: +class _HyperlinkResult(NamedTuple): """ - Find matching OSC 8 close, handling nesting. + Result of processing an OSC 8 hyperlink. - Returns (start, end) or None. + Only the fields relevant to each action are populated. """ - depth = 1 - idx = open_end - while idx < len(text): - if text[idx] != '\x1b': - idx += 1 - continue - m = _SEQUENCE_CLASSIFY.match(text, idx) - if not m: - idx += 1 - continue - seq = m.group() - if _HYPERLINK_CLOSE_RE.match(seq): - depth -= 1 - if depth == 0: - return (idx, m.end()) - elif _parse_hyperlink_open(seq): - depth += 1 - idx = m.end() - return None + action: _HyperlinkAction + close_end: int = 0 + inner_width: int = 0 + open_seq: str = '' + clipped_inner: str = '' + close_seq: str = '' + clipped_width: int = 0 + hl_col_end: int = 0 -def clip( - text: str, - start: int, - end: int, - *, - fillchar: str = ' ', - tabsize: int = 8, - ambiguous_width: int = 1, - propagate_sgr: bool = True, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', -) -> str: - r""" - Clip text to display columns ``(start, end)`` while preserving all terminal sequences. - - This function extracts a substring based on visible column positions rather than - character indices. 
Terminal escape sequences are preserved in the output since - they have zero display width. If a wide character (width 2) would be split at - either boundary, it is replaced with ``fillchar``. +def _apply_sgr_wrap(result: str, sgr_at_clip_start: object) -> str: + """ + Apply SGR prefix/suffix around *result*. - TAB characters (``\t``) are expanded to spaces up to the next tab stop, - controlled by the ``tabsize`` parameter. + If an SGR state was captured at the first visible character, prefix the result with the + corresponding SGR sequence and suffix with a reset if any styles are active. + """ + if sgr_at_clip_start is not None: + if prefix := _sgr_state_to_sequence(sgr_at_clip_start): + result = prefix + result + if _sgr_state_is_active(sgr_at_clip_start): + result += '\x1b[0m' + return result + + +def _process_hyperlink( + ctx: _ClipContext, + params: HyperlinkParams, + match_end: int, + col: int, +) -> _HyperlinkResult: + """ + Process an OSC 8 hyperlink unit. - When no horizontal cursor movements are present (backspace, carriage return, or - CSI C/D/G sequences), cursor movement characters and sequences are passed through - as zero-width sequences. When cursor movement is detected, a "painter's - algorithm" is used instead: cursor movements actively change the write position, - allowing cursor-left and carriage return to overwrite previously written cells. + Finds the matching close sequence, measures the inner text width, and determines whether the + hyperlink is empty, outside the clip window, or visible (requiring inner-text clipping). 
+ """ + close_start, close_end = Hyperlink.find_close(ctx.text, match_end) + if (close_start, close_end) == (-1, -1): + return _HyperlinkResult(_HyperlinkAction.NO_CLOSE) + inner_text = ctx.text[match_end:close_start] + inner_width = width( + inner_text, control_codes=ctx.control_codes, + tabsize=ctx.tabsize, ambiguous_width=ctx.ambiguous_width, + ) - **OSC 8 hyperlinks** are handled specially: the visible text inside a hyperlink - is clipped to the requested column range, and the hyperlink is rebuilt around - the clipped text. Empty hyperlinks (those with no remaining visible text after - clipping) are removed:: + if inner_width == 0: + return _HyperlinkResult(_HyperlinkAction.EMPTY, close_end=close_end) - >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 6, 10) - '\x1b]8;;http://example.com\x07This\x1b]8;;\x07' + hl_col_end = col + inner_width - :param text: String to clip, may contain terminal escape sequences. - :param start: Absolute starting column (inclusive, 0-indexed). - :param end: Absolute ending column (exclusive). - :param fillchar: Character to use when a wide character must be split at - a boundary (default space). Must have display width of 1. - :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through - as zero-width (preserved in output but don't advance column position). - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :param propagate_sgr: If True (default), SGR (terminal styling) sequences - are propagated. The result begins with any active style at the start - position and ends with a reset sequence if styles are active. 
- :param control_codes: How to handle control characters and sequences: + if hl_col_end <= ctx.start or col >= ctx.end: + return _HyperlinkResult(_HyperlinkAction.OUTSIDE, close_end=close_end, + inner_width=inner_width) - - ``'parse'`` (default): Track horizontal cursor movement and clip - hyperlink text. Cursor overwrite of hyperlink cells is allowed - (the hyperlink open/close are preserved as sequences). - - ``'strict'``: Like ``parse``, but raises :exc:`ValueError` when a - cursor movement would overwrite a cell that is part of an OSC 8 - hyperlink, as this produces indeterminate results on real terminals. - - ``'ignore'``: All control characters are treated as zero-width. - Cursor movement is not tracked (fastest path). + inner_clip_start = max(0, ctx.start - col) + inner_clip_end = ctx.end - col - :returns: Substring of ``text`` spanning display columns ``(start, end)``, - with all terminal sequences preserved and wide characters at boundaries - replaced with ``fillchar``. + clipped_inner = clip( + inner_text, inner_clip_start, inner_clip_end, + fillchar=ctx.fillchar, tabsize=ctx.tabsize, + ambiguous_width=ctx.ambiguous_width, + propagate_sgr=False, + control_codes=ctx.control_codes, + ) - :raises ValueError: If ``control_codes='strict'`` and a cursor movement - would overwrite a cell that was emitted as part of an OSC 8 hyperlink. + clipped_width = width( + clipped_inner, control_codes=ctx.control_codes, + tabsize=ctx.tabsize, ambiguous_width=ctx.ambiguous_width, + ) - SGR (terminal styling) sequences are propagated by default. 
The result - begins with any active style and ends with a reset:: + return _HyperlinkResult( + _HyperlinkAction.VISIBLE, + close_end=close_end, + inner_width=inner_width, + open_seq=params.make_open(), + clipped_inner=clipped_inner, + close_seq=params.make_close(), + clipped_width=clipped_width, + hl_col_end=hl_col_end, + ) - >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) - '\x1b[1;34mworld\x1b[0m' - Set ``propagate_sgr=False`` to disable this behavior. +def _reconstruct_painter( + cells: dict[int, tuple[str, int]], + sequences: list[tuple[int, int, str]], + start: int, + end: int, + fillchar: str, +) -> str: + """ + Reconstruct the output string from painter's algorithm state. - .. versionadded:: 0.3.0 + Walks columns left-to-right, interleaving escape sequences and cell content, filling gaps with + *fillchar*. + """ + # Group and sort sequences by column, preserving insertion order within each. + seqs_by_col: dict[int, list[tuple[int, str]]] = {} + for col_pos, order, seq_text in sequences: + seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) + for entries in seqs_by_col.values(): + entries.sort() - .. versionchanged:: 0.5.0 - Added ``propagate_sgr`` parameter (default True). + max_cell_col = max(cells.keys()) if cells else -1 + max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 + max_col = max(max_cell_col, max_seq_col) - .. versionchanged:: 0.7.0 - Added ``control_codes`` parameter and OSC 8 hyperlink-aware clipping. + parts: list[str] = [] + walk_col = 0 + col_limit = min(max_col, end) + while walk_col <= col_limit: + # Emit any sequences anchored at this column. 
+ for _, seq_text in seqs_by_col.get(walk_col, ()): + parts.append(seq_text) - Example:: + if walk_col >= end: + walk_col += 1 + continue - >>> clip('hello world', 0, 5) - 'hello' - >>> clip('中文字', 0, 3) # Wide char split at column 3 - '中 ' - >>> clip('a\tb', 0, 10) # Tab expanded to spaces - 'a b' - """ - start = max(start, 0) - if end <= start: - return '' + if walk_col in cells: + cell_text, cell_w = cells[walk_col] + parts.append(cell_text) + walk_col += cell_w + else: + if start <= walk_col <= max_cell_col: + parts.append(fillchar) + walk_col += 1 - strict = control_codes == 'strict' + # Emit sequences anchored beyond the visible region. + for c in sorted(seqs_by_col.keys()): + if c > col_limit: + for _, seq_text in seqs_by_col[c]: + parts.append(seq_text) - # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) - if text.isascii() and text.isprintable(): - return text[start:end] + return ''.join(parts) - # Fast path: no escape sequences means no SGR tracking needed - has_esc = '\x1b' in text - if propagate_sgr and not has_esc: - propagate_sgr = False - # Use painter's algorithm only when cursor movement (BS, CR, CSI C/D) can overwrite - # previously emitted cells. Text without any horizontal movement uses the fast simple path. - # Use direct char checks to avoid regex scan overhead for the common (no-cursor) case. - use_painter = ( - control_codes != 'ignore' and - ('\x08' in text or '\r' in text or - (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) - ) +def _clip_simple(ctx: _ClipContext) -> tuple[str, object]: + """ + Clip text without cursor movement (simple append-to-output path). - # SGR tracking state (only when propagate_sgr=True) sgr_at_clip_start is - # sgr state when first visible char emitted (None = not yet) + Returns ``(result, sgr_at_clip_start)``. The caller applies SGR wrapping. + """ + # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
+ _text = ctx.text + _end = ctx.end + _start = ctx.start + _propg = ctx.propagate_sgr + _ambw = ctx.ambiguous_width + _fillchar = ctx.fillchar + _tabsize = ctx.tabsize + _strict = ctx.strict + + output: list[str] = [] + col = 0 + idx = 0 sgr_at_clip_start = None - # current active sgr state - sgr = None # current SGR state, updated by SGR matches - if propagate_sgr: - sgr = _SGR_STATE_DEFAULT - - # Inner helpers - # Closure-based to avoid LOAD_GLOBAL overhead on hot-path calls. - # Each has low individual McCabe complexity. + sgr = _SGR_STATE_DEFAULT if _propg else None - def _mark_sgr_capture() -> None: - """Record SGR state at first visible emit, if not already captured.""" + def _mark() -> None: nonlocal sgr_at_clip_start - if propagate_sgr and sgr_at_clip_start is None: + if _propg and sgr_at_clip_start is None: sgr_at_clip_start = sgr - def _process_hyperlink( - hl_state: _HyperlinkState, match_end: int, col: int, - ) -> tuple[str, object]: - """Process OSC 8 hyperlink unit. - - Returns (action, data): - action='no_close' -> data unused (emit as regular seq, advance past match_end) - action='empty' -> data is close_end (skip entirely) - action='outside' -> data is (inner_width, close_end) (advance col, skip) - action='visible' -> data is (open_seq, clipped_inner, close_seq, - inner_width, clipped_width, hl_col_end, close_end) - """ - close_span = _find_hyperlink_close(text, match_end) - if close_span is None: - return ('no_close', None) - - close_start, close_end = close_span - inner_text = text[match_end:close_start] - inner_width = width( - inner_text, control_codes=control_codes, - tabsize=tabsize, ambiguous_width=ambiguous_width, - ) - - if inner_width == 0: - return ('empty', close_end) - - hl_col_start = col - hl_col_end = col + inner_width - - if hl_col_end <= start or hl_col_start >= end: - return ('outside', (inner_width, close_end)) - - inner_clip_start = max(0, start - col) - inner_clip_end = end - col - - clipped_inner = clip( - inner_text, 
inner_clip_start, inner_clip_end, - fillchar=fillchar, tabsize=tabsize, - ambiguous_width=ambiguous_width, - propagate_sgr=False, - control_codes=control_codes, - ) - - # Compute clipped width once here; avoids a second width() call in the painter path. - clipped_width = width( - clipped_inner, control_codes=control_codes, - tabsize=tabsize, ambiguous_width=ambiguous_width, - ) - - return ('visible', ( - _make_hyperlink_open(hl_state.url, hl_state.params, hl_state.terminator), - clipped_inner, - _make_hyperlink_close(hl_state.terminator), - inner_width, - clipped_width, - hl_col_end, - close_end, - )) - - def _emit_tab_simple(col: int, output: list[str]) -> int: - """Expand tab for simple-path, appending spaces to output list.""" - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) + def _emit_tab(col: int) -> int: + """Expand tab, appending spaces to output list.""" + if _tabsize > 0: + next_tab = col + (_tabsize - (col % _tabsize)) while col < next_tab: - if start <= col < end: + if _start <= col < _end: output.append(' ') - _mark_sgr_capture() + _mark() col += 1 else: output.append('\t') return col - def _emit_tab_painter(col: int, write_cells, append_seq) -> int: - """Expand tab for painter-path.""" - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - write_cells(' ', 1, col) - col += 1 - else: - append_seq('\t') - return col - - def _handle_grapheme_simple( - grapheme: str, gw: int, col: int, output: list[str], - ) -> None: - """Emit grapheme to simple-path output list based on visibility.""" + def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: + """Emit grapheme to output list based on visibility.""" if gw == 0: - if start <= col < end: + if _start <= col < _end: output.append(grapheme) - elif col >= start and col + gw <= end: + elif col >= _start and col + gw <= _end: output.append(grapheme) - _mark_sgr_capture() - elif col < end and col + gw > start: - 
output.append(fillchar * (min(end, col + gw) - max(start, col))) - _mark_sgr_capture() - - def _handle_grapheme_painter( - grapheme: str, gw: int, col: int, write_cells, append_seq, - ) -> None: - """Emit grapheme to painter-path based on visibility.""" - if gw == 0: - if start <= col < end: - append_seq(grapheme) - elif col >= start and col + gw <= end: - write_cells(grapheme, gw, col) - elif col < end and col + gw > start: - clip_start = max(start, col) - for offset in range(min(end, col + gw) - clip_start): - write_cells(fillchar, 1, clip_start + offset) - - def _apply_sgr_wrap(result: str) -> str: - """Apply SGR prefix/suffix around result.""" - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): - result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): - result += '\x1b[0m' - return result - - # Main loops - - if not use_painter: - # Simple path: no cursor movement - output: list[str] = [] - col = 0 - idx = 0 - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': + _mark() + elif col < _end and col + gw > _start: + output.append(_fillchar * (min(_end, col + gw) - max(_start, col))) + _mark() + + while idx < len(_text): + char = _text[idx] + + # Early exit: past visible region. 
+ if col >= _end and char not in '\r\x08\t\x1b': + if sgr_at_clip_start is not None: break + if not _propg: + next_esc = _text.find('\x1b', idx + 1) + if next_esc == -1: + break + idx = next_esc + continue - # Handle escape sequences - if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(text, idx) - if not m: - output.append(char) - idx += 1 - continue - - # SGR handling: update state, don't emit sequence - if m.group('sgr_params') is not None and propagate_sgr and sgr: - sgr = _sgr_state_update(sgr, m.group()) - idx = m.end() - continue - - # OSC 8 hyperlink - if hl_state := _parse_hyperlink_open(m.group()): - action, data = _process_hyperlink(hl_state, m.end(), col) - if action == 'no_close': - output.append(m.group()) - idx = m.end() - elif action == 'empty': - idx = data - elif action == 'outside': - inner_width, close_end = data - col += inner_width - idx = close_end - else: # 'visible' - open_seq, clipped_inner, close_seq, inner_width, clipped_width, _, close_end = data - output.append(open_seq) - output.append(clipped_inner) - output.append(close_seq) - _mark_sgr_capture() - col += inner_width - idx = close_end - continue - - # Any other recognized sequence preserved as-is - output.append(m.group()) + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(_text, idx) + if not m: + output.append(char) + idx += 1 + continue + + # SGR: update state, do not emit. + if m.group('sgr_params') is not None and _propg and sgr is not None: + sgr = _sgr_state_update(sgr, m.group()) idx = m.end() continue - # TAB expansion - if char == '\t': - col = _emit_tab_simple(col, output) - idx += 1 + # OSC 8 hyperlink. 
+ if hl_state := HyperlinkParams.parse(m.group()): + r = _process_hyperlink(ctx, hl_state, m.end(), col) + if r.action is _HyperlinkAction.NO_CLOSE: + output.append(m.group()) + idx = m.end() + elif r.action is _HyperlinkAction.EMPTY: + idx = r.close_end + elif r.action is _HyperlinkAction.OUTSIDE: + col += r.inner_width + idx = r.close_end + else: + output.append(r.open_seq) + output.append(r.clipped_inner) + output.append(r.close_seq) + _mark() + col += r.inner_width + idx = r.close_end continue - # Grapheme clustering - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - _handle_grapheme_simple(grapheme, grapheme_w, col, output) - col += grapheme_w - idx += len(grapheme) + # Indeterminate-effect sequences: raise in strict mode. + seq = m.group() + if _strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + raise ValueError( + f"Indeterminate cursor sequence at position {idx}, " + f"{seq!r}" + ) - result = _apply_sgr_wrap(''.join(output)) - return result + # Any other recognized sequence: preserve as-is. + output.append(seq) + idx = m.end() + continue + + if char == '\t': + col = _emit_tab(col) + idx += 1 + continue + + grapheme = next(iter_graphemes(_text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=_ambw) + _handle_grapheme(grapheme, grapheme_w, col) + col += grapheme_w + idx += len(grapheme) + + return ''.join(output), sgr_at_clip_start + + +def _clip_painter(ctx: _ClipContext) -> tuple[str, object]: + """ + Clip text with cursor movement (painter's algorithm path). + + Returns ``(result, sgr_at_clip_start)``. The caller applies SGR wrapping. + """ + # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
+ _text = ctx.text + _end = ctx.end + _start = ctx.start + _propg = ctx.propagate_sgr + _ambw = ctx.ambiguous_width + _fillchar = ctx.fillchar + _tabsize = ctx.tabsize + _strict = ctx.strict - # Painter's algorithm path: handles cursor movement cells: dict[int, tuple[str, int]] = {} hyperlink_cells: set[int] = set() sequences: list[tuple[int, int, str]] = [] @@ -377,25 +332,24 @@ def _apply_sgr_wrap(result: str) -> str: col = 0 idx = 0 + sgr_at_clip_start = None + sgr = _SGR_STATE_DEFAULT if _propg else None + + def _mark() -> None: + nonlocal sgr_at_clip_start + if _propg and sgr_at_clip_start is None: + sgr_at_clip_start = sgr def _write_cells(s: str, w: int, write_col: int, is_hyperlink: bool = False) -> None: - nonlocal sgr_at_clip_start - if strict and not is_hyperlink: - for offset in range(w): - if write_col + offset in hyperlink_cells: - raise ValueError( - f"Cursor movement at column {write_col + offset} " - f"would overwrite an OSC 8 hyperlink cell. " - f"Use control_codes='parse' to allow this." 
- ) + """Write *w* cells of text *s* at *write_col*, handling wide-char splitting.""" for offset in range(w): src_col = write_col + offset if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (fillchar, 1) + cells[src_col - 1] = (_fillchar, 1) hyperlink_cells.discard(src_col - 1) if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (fillchar, 1) + cells[src_col + 1] = (_fillchar, 1) hyperlink_cells.discard(src_col + 1) cells.pop(src_col, None) hyperlink_cells.discard(src_col) @@ -403,81 +357,113 @@ def _write_cells(s: str, w: int, write_col: int, if is_hyperlink: for offset in range(w): hyperlink_cells.add(write_col + offset) - _mark_sgr_capture() + _mark() def _append_seq(seq: str, at_col: Optional[int] = None) -> None: + """Append a zero-width escape sequence anchored at the current column.""" nonlocal seq_order c = col if at_col is None else at_col sequences.append((c, seq_order, seq)) seq_order += 1 - _mark_sgr_capture() + _mark() - while idx < len(text): - char = text[idx] + def _emit_tab(col: int) -> int: + """Expand tab for painter-path.""" + if _tabsize > 0: + next_tab = col + (_tabsize - (col % _tabsize)) + while col < next_tab: + if _start <= col < _end: + _write_cells(' ', 1, col) + col += 1 + else: + _append_seq('\t') + return col - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': + def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: + """Emit grapheme to painter-path based on visibility.""" + if gw == 0: + if _start <= col < _end: + _append_seq(grapheme) + elif col >= _start and col + gw <= _end: + _write_cells(grapheme, gw, col) + elif col < _end and col + gw > _start: + clip_start = max(_start, col) + for offset in range(min(_end, col + gw) - clip_start): + _write_cells(_fillchar, 1, clip_start + offset) + + while idx < len(_text): + char = _text[idx] + + # Early exit: past visible region, SGR captured, no escape 
ahead. + # Note: unlike _clip_simple, we cannot skip past non-escape chars when + # propagate_sgr is False, because cursor movements (\r, \x08, CSI C/D) + # depend on accurate column tracking and may move back into the visible region. + if col >= _end and sgr_at_clip_start is not None and char != '\x1b': break - # 1. Handle escape sequences -- single regex dispatch if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(text, idx) + m = _SEQUENCE_CLASSIFY.match(_text, idx) if not m: _append_seq(char) idx += 1 continue - # SGR handling: update state, don't emit sequence - if m.group('sgr_params') is not None and propagate_sgr and sgr: + # SGR: update state, do not emit. + if m.group('sgr_params') is not None and _propg and sgr is not None: sgr = _sgr_state_update(sgr, m.group()) idx = m.end() continue - # OSC 8 hyperlink - if hl_state := _parse_hyperlink_open(m.group()): - action, data = _process_hyperlink(hl_state, m.end(), col) - if action == 'no_close': + # OSC 8 hyperlink. + if hl_state := HyperlinkParams.parse(m.group()): + r = _process_hyperlink(ctx, hl_state, m.end(), col) + if r.action is _HyperlinkAction.NO_CLOSE: _append_seq(m.group()) idx = m.end() - elif action == 'empty': - idx = data - elif action == 'outside': - inner_width, close_end = data - col += inner_width - idx = close_end - else: # 'visible' - open_seq, clipped_inner, close_seq, inner_width, clipped_width, hl_col_end, close_end = data - _append_seq(open_seq) - _write_cells(clipped_inner, clipped_width, col, + elif r.action is _HyperlinkAction.EMPTY: + idx = r.close_end + elif r.action is _HyperlinkAction.OUTSIDE: + col += r.inner_width + idx = r.close_end + else: + _append_seq(r.open_seq) + _write_cells(r.clipped_inner, r.clipped_width, col, is_hyperlink=True) - col += clipped_width - _append_seq(close_seq, at_col=col) - # Advance past the original hyperlink content - col = hl_col_end - idx = close_end + col += r.clipped_width + _append_seq(r.close_seq, at_col=col) + col = r.hl_col_end + idx = 
r.close_end continue - # 1a. HPA: horizontal position absolute (CSI n G) + # Indeterminate-effect sequences: raise in strict mode. + seq = m.group() + if _strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + raise ValueError( + f"Indeterminate cursor sequence at position {idx}, " + f"{seq!r}" + ) + + # Horizontal Position Absolute (CSI n G). if (hpa_n := m.group('hpa_n')) is not None: col = int(hpa_n) - 1 if hpa_n else 0 idx = m.end() continue - # 1b. Cursor forward + # Cursor Forward (CSI n C). if (cforward_n := m.group('cforward_n')) is not None: n_forward = int(cforward_n) if cforward_n else 1 move_end = col + n_forward - if col < end and move_end > start: - for i in range(max(col, start), min(move_end, end)): - _write_cells(fillchar, 1, i) + if col < _end and move_end > _start: + for i in range(max(col, _start), min(move_end, _end)): + _write_cells(_fillchar, 1, i) col = move_end idx = m.end() continue - # 1c. Cursor backward + # Cursor Backward (CSI n D). if (cbackward_n := m.group('cbackward_n')) is not None: n_backward = int(cbackward_n) if cbackward_n else 1 - if strict and n_backward > col: + if _strict and n_backward > col: raise ValueError( f"Cursor left movement at position {idx} would move " f"{n_backward} cells left from column {col}, " @@ -487,70 +473,164 @@ def _append_seq(seq: str, at_col: Optional[int] = None) -> None: idx = m.end() continue - # 1d. Any other recognized zero-width sequence + # Any other recognized sequence: preserve as-is. _append_seq(m.group()) idx = m.end() continue - # 2. Carriage return and backspace (before TAB/grapheme fallthrough) + # Carriage return. if char == '\r': col = 0 idx += 1 continue + # Backspace. if char == '\x08': if col > 0: col -= 1 idx += 1 continue - # 3. TAB expansion + # Tab expansion. if char == '\t': - col = _emit_tab_painter(col, _write_cells, _append_seq) + col = _emit_tab(col) idx += 1 continue - # 4. 
Grapheme clustering - grapheme = next(iter_graphemes(text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) - _handle_grapheme_painter(grapheme, grapheme_w, col, _write_cells, _append_seq) + # Grapheme cluster. + grapheme = next(iter_graphemes(_text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=_ambw) + _handle_grapheme(grapheme, grapheme_w, col) col += grapheme_w idx += len(grapheme) - # Reconstruct result from "painter's algorithm" - seqs_by_col: dict[int, list[tuple[int, str]]] = {} - for col_pos, order, seq_text in sequences: - seqs_by_col.setdefault(col_pos, []).append((order, seq_text)) - for entries in seqs_by_col.values(): - entries.sort() + result = _reconstruct_painter( + cells, sequences, _start, _end, _fillchar, + ) + return result, sgr_at_clip_start - max_cell_col = max(cells.keys()) if cells else -1 - max_seq_col = max(seqs_by_col.keys()) if seqs_by_col else -1 - max_col = max(max_cell_col, max_seq_col) - parts: list[str] = [] - walk_col = 0 - col_limit = min(max_col, end) - while walk_col <= col_limit: - for _, seq_text in seqs_by_col.get(walk_col, ()): - parts.append(seq_text) +def clip( + text: str, + start: int, + end: int, + *, + fillchar: str = ' ', + tabsize: int = 8, + ambiguous_width: int = 1, + propagate_sgr: bool = True, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', +) -> str: + r""" + Clip text to display columns ``(start, end)`` while preserving all terminal sequences. - if walk_col >= end: - walk_col += 1 - continue + This function extracts a substring based on visible column positions rather than + character indices. Terminal escape sequences are preserved in the output since + they have zero display width. If a wide character (width 2) is split at + either boundary, it is replaced with ``fillchar``. 
- if walk_col in cells: - cell_text, cell_w = cells[walk_col] - parts.append(cell_text) - walk_col += cell_w - else: - if start <= walk_col <= max_cell_col: - parts.append(fillchar) - walk_col += 1 + TAB characters (``\t``) are expanded to spaces up to the next tab stop, + controlled by the ``tabsize`` parameter. When cursor movement is detected, + a "painter's algorithm" is used, cursor movements actively change the write + position, allowing cursor-left and carriage return to overwrite previously + written cells. It is assumed that ``text`` begins at column 0. - for c in sorted(seqs_by_col.keys()): - if c > col_limit: - for _, seq_text in seqs_by_col[c]: - parts.append(seq_text) + **OSC 8 hyperlinks** are handled specially: the visible text inside a hyperlink + is clipped to the requested column range, and the hyperlink is rebuilt around + the clipped text. Empty hyperlinks (those with no remaining visible text after + clipping) are removed:: + + >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 6, 10) + '\x1b]8;;http://example.com\x07This\x1b]8;;\x07' + + :param text: String to clip, may contain terminal escape sequences. + :param start: Absolute starting column (inclusive, 0-indexed). + :param end: Absolute ending column (exclusive). + :param fillchar: Character to use when a wide character must be split at + a boundary (default space). Must have display width of 1. + :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through + as zero-width (preserved in output but don't advance column position). + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :param propagate_sgr: If True (default), SGR (terminal styling) sequences + are propagated. The result begins with any active style at the start + position and ends with a reset sequence if styles are active. 
+ :param control_codes: How to handle control characters and sequences: + + - ``'parse'`` (default): Track horizontal cursor movement and clip + hyperlink text. Cursor overwrite is always allowed, with best effort + results; indeterminate sequences (home, clear, reset, etc.) are + preserved as zero-width. + - ``'strict'``: Like ``parse``, but raises :exc:`ValueError` on + sequences with indeterminate effects (cursor home, clear screen, + reset, vertical movement, etc.) matching :func:`width` behavior. + Also raises on out-of-bounds horizontal cursor movement. + - ``'ignore'``: All control characters are treated as zero-width. + Cursor movement is not tracked (fastest path). + + :returns: Substring of ``text`` spanning display columns ``(start, end)``, + with all terminal sequences preserved and wide characters at boundaries + replaced with ``fillchar``. + + :raises ValueError: If ``control_codes='strict'`` and an indeterminate-effect + sequence or out-of-bounds cursor movement is encountered. + + SGR (terminal styling) sequences are propagated by default. The result + begins with any active style and ends with a reset:: + + >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) + '\x1b[1;34mworld\x1b[0m' + + Set ``propagate_sgr=False`` to disable this behavior. + + .. versionadded:: 0.3.0 + + .. versionchanged:: 0.5.0 + Added ``propagate_sgr`` parameter (default True). + + .. versionchanged:: 0.7.0 + Added ``control_codes`` parameter (default 'parse'). + OSC 8 hyperlink-aware clipping. + + Example:: + + >>> clip('hello world', 0, 5) + 'hello' + >>> clip('中文字', 0, 3) # Wide char split at column 3 + '中 ' + >>> clip('a\tb', 0, 10) # Tab expanded to spaces + 'a b' + """ + start = max(start, 0) + if end <= start: + return '' + + # Fast path: printable ASCII only. + if text.isascii() and text.isprintable(): + return text[start:end] + + # No escape sequences => no SGR tracking needed. 
+ has_esc = '\x1b' in text + if propagate_sgr and not has_esc: + propagate_sgr = False + + # Use painter's algorithm only when cursor movement can overwrite cells. + fn_clip = _clip_painter if ( + control_codes != 'ignore' and + ('\x08' in text or '\r' in text or + (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) + ) else _clip_simple + + ctx = _ClipContext( + text=text, + start=start, + end=end, + fillchar=fillchar, + tabsize=tabsize, + ambiguous_width=ambiguous_width, + control_codes=control_codes, + strict=(control_codes == 'strict'), + propagate_sgr=propagate_sgr, + ) - return _apply_sgr_wrap(''.join(parts)) + return _apply_sgr_wrap(*fn_clip(ctx)) diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 4625b12..cf4317f 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -1,6 +1,8 @@ """This is a python implementation of wcswidth().""" -import typing +from __future__ import annotations + +from typing import Callable, Optional # local from ._wcwidth import wcwidth @@ -15,7 +17,8 @@ class GraphemeMeasurer: - """Stateful measurer for grapheme-aware character width. + """ + Stateful measurer for grapheme-aware character width. Encapsulates the lookbehind state that must be threaded through sequential per-character measurements by :meth:`measure_at`. @@ -25,7 +28,7 @@ class GraphemeMeasurer: from applying across the gap. """ - def __init__(self, text: str, end: int, wcwidth_fn) -> None: + def __init__(self, text: str, end: int, wcwidth_fn: Callable[[str], int]) -> None: self._text = text self._end = end self._wcwidth_fn = wcwidth_fn @@ -35,7 +38,8 @@ def __init__(self, text: str, end: int, wcwidth_fn) -> None: self.conjunct_pending = False def measure_at(self, idx: int) -> tuple[int, int]: - """Process character at ``text[idx]`` and return ``(next_idx, width)``. + """ + Process character at ``text[idx]`` and return ``(next_idx, width)``. 
Handles ZWJ, VS16, Regional Indicators, Fitzpatrick modifiers, virama conjunct formation, Mc spacing marks, and standard ``wcwidth`` measurement. @@ -98,7 +102,7 @@ def measure_at(self, idx: int) -> tuple[int, int]: # Normal character: measure with wcwidth w = self._wcwidth_fn(char) if w < 0: - # C0/C1 control character — caller must handle + # C0/C1 control character (returns -1: caller should handle!) return (idx + 1, -1) if w > 0: extra = 1 if self.conjunct_pending else 0 @@ -117,10 +121,11 @@ def measure_at(self, idx: int) -> tuple[int, int]: return (idx + 1, 0) def reset_adjacency(self) -> None: - """Break VS16/Fitzpatrick adjacency. + """ + Break VS16/Fitzpatrick adjacency. - Call after processing escape sequences or control codes to prevent - VS16 and Fitzpatrick lookbehind from applying across the gap. + Call after processing escape sequences or control codes to prevent VS16 and Fitzpatrick + lookbehind from applying across the gap. """ self._last_measured_idx = -2 self._last_measured_ucs = -1 @@ -128,7 +133,7 @@ def reset_adjacency(self) -> None: def wcswidth( pwcs: str, - n: typing.Union[int, None] = None, + n: Optional[int] = None, unicode_version: str = 'auto', ambiguous_width: int = 1, ) -> int: @@ -168,9 +173,7 @@ def wcswidth( if n is None and pwcs.isascii() and pwcs.isprintable(): return len(pwcs) - # Select wcwidth call pattern for best lru_cache performance: - # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls - # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) + # Select wcwidth call pattern for best lru_cache performance _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) end = len(pwcs) if n is None else n diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 478832a..42982c5 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -4,7 +4,7 @@ # local from ._wcwidth import wcwidth -from ._wcswidth import wcswidth, 
GraphemeMeasurer +from ._wcswidth import GraphemeMeasurer, wcswidth from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL from .escape_sequences import (_SEQUENCE_CLASSIFY, CURSOR_MOVEMENT_SEQUENCE, diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index b6a6680..afa8c43 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -10,38 +10,10 @@ import re import typing -from typing import Optional, NamedTuple # local from .sgr_state import _SGR_PATTERN -_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') -_HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(?:\x07|\x1b\\)') - -class _HyperlinkState(NamedTuple): - """Open OSC 8 hyperlink: url, params, terminator (BEL or ST).""" - - url: str - params: str - terminator: str - - -def _parse_hyperlink_open(seq: str) -> Optional[_HyperlinkState]: - """Parse OSC 8 open sequence; return state or None.""" - if (m := _HYPERLINK_OPEN_RE.match(seq)): - return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) - return None - - -def _make_hyperlink_open(url: str, params: str, terminator: str) -> str: - """Generate OSC 8 open sequence.""" - return f'\x1b]8;{params};{url}{terminator}' - - -def _make_hyperlink_close(terminator: str) -> str: - """Generate OSC 8 close sequence.""" - return f'\x1b]8;;{terminator}' - # Zero-width escape sequences (SGR, OSC, CSI, etc.). This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. 
ZERO_WIDTH_PATTERN = re.compile( diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 850e4c2..be14ac5 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -16,11 +16,9 @@ # local from ._width import width as wcwidth_width from .grapheme import iter_graphemes +from .hyperlink import HyperlinkParams from .sgr_state import propagate_sgr as _propagate_sgr -from .escape_sequences import (ZERO_WIDTH_PATTERN, iter_sequences, - _HYPERLINK_OPEN_RE, _HyperlinkState, - _parse_hyperlink_open, _make_hyperlink_open, - _make_hyperlink_close) +from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal @@ -214,7 +212,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines: list[str] = [] is_first_line = True - hyperlink_state: Optional[_HyperlinkState] = None + hyperlink_state: Optional[HyperlinkParams] = None # Track the id we're using for the current hyperlink continuation current_hyperlink_id: Optional[str] = None @@ -231,8 +229,11 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m # If continuing a hyperlink from previous line, prepend open sequence if hyperlink_state is not None: - open_seq = _make_hyperlink_open( - hyperlink_state.url, hyperlink_state.params, hyperlink_state.terminator) + open_seq = HyperlinkParams( + url=hyperlink_state.url, + params=hyperlink_state.params, + terminator=hyperlink_state.terminator, + ).make_open() chunks[-1] = open_seq + chunks[-1] # Drop leading whitespace (except at very start) @@ -311,20 +312,26 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m f'id={self._next_hyperlink_id()}:{new_state.params}') else: current_hyperlink_id = f'id={self._next_hyperlink_id()}' - line_content += _make_hyperlink_close(new_state.terminator) + line_content += HyperlinkParams(terminator=new_state.terminator, url='').make_close() # Also need to inject the id into the 
opening # sequence if it didn't have one if 'id=' not in new_state.params: # Find and replace the original open sequence with one that has id - old_open = _make_hyperlink_open( - new_state.url, new_state.params, new_state.terminator) - new_open = _make_hyperlink_open( - new_state.url, current_hyperlink_id, new_state.terminator) + old_open = HyperlinkParams( + url=new_state.url, + params=new_state.params, + terminator=new_state.terminator, + ).make_open() + new_open = HyperlinkParams( + url=new_state.url, + params=current_hyperlink_id, + terminator=new_state.terminator, + ).make_open() line_content = line_content.replace(old_open, new_open, 1) # Update state for next line, using computed id - hyperlink_state = _HyperlinkState( + hyperlink_state = HyperlinkParams( new_state.url, current_hyperlink_id, new_state.terminator) else: hyperlink_state = None @@ -348,8 +355,8 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m new_state = self._track_hyperlink_state( line_content, hyperlink_state) if new_state is not None: - line_content += _make_hyperlink_close( - new_state.terminator) + line_content += HyperlinkParams( + terminator=new_state.terminator, url='').make_close() lines.append(indent + line_content + self.placeholder) break current_width -= self._width(current_line[-1]) @@ -368,7 +375,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m def _track_hyperlink_state( self, text: str, - state: Optional[_HyperlinkState]) -> Optional[_HyperlinkState]: + state: Optional[HyperlinkParams]) -> Optional[HyperlinkParams]: """ Track hyperlink state through text. 
@@ -378,7 +385,7 @@ def _track_hyperlink_state( """ for segment, is_seq in iter_sequences(text): if is_seq: - parsed_link = _parse_hyperlink_open(segment) + parsed_link = HyperlinkParams.parse(segment) if parsed_link is not None and parsed_link.url: # has URL = open state = parsed_link elif segment.startswith(('\x1b]8;;\x1b\\', '\x1b]8;;\x07')): # close From 1746d13abb264e091e636e43d95d3d87b60c94c1 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 12:20:34 -0400 Subject: [PATCH 54/70] linting is out of control --- docs/intro.rst | 5 +++-- tests/test_clip.py | 32 +++++++++++++++++--------------- wcwidth/_clip.py | 13 ++++++++++--- wcwidth/_wcswidth.py | 2 ++ wcwidth/textwrap.py | 3 ++- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 4deb5df..370833e 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -62,7 +62,8 @@ Discrepancies You may find that support *varies* for complex unicode sequences or codepoints. This library may be considered to presume the terminal is enabled for DEC Private Mode 2027 ("Grapheme Clustering"), but the specification does not fully describe varying unicode versions, feature levels, or details of -specific language support. This library does *not* support any alternate "legacy width" measurement. +specific language support. This library does *not* support any alternate "legacy width" +measurement. See `Grapheme Clusters and Terminal Emulators`_ and `terminal-unicode-core.tex`_, and `State of Terminal Emulators in 2025`_ for more details on Mode 2027 and unicode-aware terminals. 
@@ -142,7 +143,7 @@ Use function `width()`_ to measure a string with improved handling of ``control_ >>> # sequences with "indeterminate" effects like Home + Clear are zero-width >>> wcwidth.width('\x1b[H\x1b[2J') 0 - >>> # horizontal cursor movements are parsed, + >>> # horizontal cursor movements are parsed, >>> wcwidth.width('hello\b\b\b\b\bworld') 5 >>> wcwidth.width('hello\x1b[5Dworld') diff --git a/tests/test_clip.py b/tests/test_clip.py index 1a4d78f..2c17a11 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -218,11 +218,26 @@ def test_clip_osc_hyperlink_text_clipping(text, start, end, expected): assert repr(clip(text, start, end)) == repr(expected) -# control_codes variants with cursor movement into hyperlink +# Control_codes variants with cursor movement into hyperlink +# +# Overwriting hyperlink cells causes corrupted "run on" hyperlinks in practical +# testing with kitty, presumably the hidden "end hyperlink" sequence is +# overwritten, in any case, we make no attempt to parse overwrite of +# hyperlinks, we consider it a "glitch sequence". _HLINK_OVERWRITE = f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[2Dxy' CLIP_HYPERLINK_CONTROL_CODES_CASES = [ ('parse', 0, 4, f'{OSC_START_BEL}link{OSC_END_BEL}'), - ('ignore', 0, 6, f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[2Dxy'), + ('parse', 0, 3, f'{OSC_START_BEL}lin{OSC_END_BEL}'), + ('parse', 0, 2, f'{OSC_START_BEL}li{OSC_END_BEL}'), + ('parse', 0, 1, f'{OSC_START_BEL}l{OSC_END_BEL}'), + # these next two are certainly "in error" + ('parse', 1, 4, f'{OSC_START_BEL}ink{OSC_END_BEL}y'), + ('parse', 1, 3, f'{OSC_START_BEL}in{OSC_END_BEL}x'), + ('parse', 1, 2, f'{OSC_START_BEL}i{OSC_END_BEL}'), + ('ignore', 0, 20, f'{_HLINK_OVERWRITE}'), + # and these two, 'xy' are missing entirely, also "in error" + ('parse', 0, 20, f'{OSC_START_BEL}link{OSC_END_BEL}'), + ('strict', 0, 20, f'{OSC_START_BEL}link{OSC_END_BEL}'), ] @@ -232,19 +247,6 @@ def test_clip_hyperlink_control_codes_overwrite(control_codes, start, end, expec assert 
repr(clip(_HLINK_OVERWRITE, start, end, control_codes=control_codes)) == repr(expected) -def test_clip_osc_hyperlink_strict_raises(): - """ - control_codes='strict' allows hyperlink-cursor interactions. - - Overwriting hyperlink cells causes corrupted "run on" hyperlinks in practical - testing with kitty, presumably the hiddden "end hyperlink" is not found, in - any case, we make no attempt to parse overwrite of hyperlinks - """ - assert repr(clip(_HLINK_OVERWRITE, 0, 4, control_codes='strict')) == repr( - f'{OSC_START_BEL}link{OSC_END_BEL}' - ) - - # Painter-path hyperlink edge cases CLIP_HYPERLINK_PAINTER_CASES = [ # Empty hyperlink dropped diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 6755d1d..761ddf3 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -11,6 +11,7 @@ from .grapheme import iter_graphemes from .hyperlink import Hyperlink, HyperlinkParams from .sgr_state import (_SGR_STATE_DEFAULT, + _SGRState, _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) @@ -21,6 +22,7 @@ class _ClipContext(NamedTuple): """Immutable parameters for a clip operation.""" + text: str start: int end: int @@ -34,6 +36,7 @@ class _ClipContext(NamedTuple): class _HyperlinkAction(enum.Enum): """Outcome of processing an OSC 8 hyperlink unit.""" + NO_CLOSE = enum.auto() # open sequence without matching close EMPTY = enum.auto() # hyperlink with no visible inner text OUTSIDE = enum.auto() # hyperlink entirely outside the clip window @@ -46,6 +49,7 @@ class _HyperlinkResult(NamedTuple): Only the fields relevant to each action are populated. """ + action: _HyperlinkAction close_end: int = 0 inner_width: int = 0 @@ -56,7 +60,7 @@ class _HyperlinkResult(NamedTuple): hl_col_end: int = 0 -def _apply_sgr_wrap(result: str, sgr_at_clip_start: object) -> str: +def _apply_sgr_wrap(result: str, sgr_at_clip_start: Optional[_SGRState]) -> str: """ Apply SGR prefix/suffix around *result*. 
@@ -129,6 +133,7 @@ def _process_hyperlink( ) +# pylint: disable=too-many-locals def _reconstruct_painter( cells: dict[int, tuple[str, int]], sequences: list[tuple[int, int, str]], @@ -183,7 +188,8 @@ def _reconstruct_painter( return ''.join(parts) -def _clip_simple(ctx: _ClipContext) -> tuple[str, object]: +# pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements +def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: """ Clip text without cursor movement (simple append-to-output path). @@ -309,7 +315,8 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: return ''.join(output), sgr_at_clip_start -def _clip_painter(ctx: _ClipContext) -> tuple[str, object]: +# pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements +def _clip_painter(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: """ Clip text with cursor movement (painter's algorithm path). diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index cf4317f..360f805 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -29,6 +29,7 @@ class GraphemeMeasurer: """ def __init__(self, text: str, end: int, wcwidth_fn: Callable[[str], int]) -> None: + """Class initializer.""" self._text = text self._end = end self._wcwidth_fn = wcwidth_fn @@ -37,6 +38,7 @@ def __init__(self, text: str, end: int, wcwidth_fn: Callable[[str], int]) -> Non self._last_was_virama = False self.conjunct_pending = False + # pylint: disable=too-complex,too-many-branches def measure_at(self, idx: int) -> tuple[int, int]: """ Process character at ``text[idx]`` and return ``(next_idx, width)``. 
diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index be14ac5..16f6b6b 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -312,7 +312,8 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m f'id={self._next_hyperlink_id()}:{new_state.params}') else: current_hyperlink_id = f'id={self._next_hyperlink_id()}' - line_content += HyperlinkParams(terminator=new_state.terminator, url='').make_close() + line_content += HyperlinkParams( + terminator=new_state.terminator, url='').make_close() # Also need to inject the id into the opening # sequence if it didn't have one From 92c2f41f8006a1a6ff3b7f2ffacc7662a946015f Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 12:24:38 -0400 Subject: [PATCH 55/70] add missing files :o --- tests/test_hyperlink.py | 75 +++++++++++++++++++++ wcwidth/hyperlink.py | 142 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 tests/test_hyperlink.py create mode 100644 wcwidth/hyperlink.py diff --git a/tests/test_hyperlink.py b/tests/test_hyperlink.py new file mode 100644 index 0000000..c2eff89 --- /dev/null +++ b/tests/test_hyperlink.py @@ -0,0 +1,75 @@ +"""Tests for OSC 8 hyperlink parsing.""" + +# 3rd party +import pytest + +# local +from wcwidth.hyperlink import Hyperlink, HyperlinkParams + +PARAMS_PARSE_VALID = [ + ('\x1b]8;;http://example.com\x07', 'http://example.com', '', '\x07'), + ('\x1b]8;id=a;http://x.com\x1b\\', 'http://x.com', 'id=a', '\x1b\\'), +] + + +@pytest.mark.parametrize('seq,url,params,term', PARAMS_PARSE_VALID) +def test_hyperlinkparams_parse_valid(seq, url, params, term): + """Parse a valid OSC 8 open sequence.""" + result = HyperlinkParams.parse(seq) + assert result is not None + assert result.url == url + assert result.params == params + assert result.terminator == term + + +@pytest.mark.parametrize('seq', [ + 'not an escape', + '\x1b[31m', + '', +]) +def test_hyperlinkparams_parse_invalid(seq): + """Parse an 
invalid/non-OSC-8 sequence returns None.""" + assert HyperlinkParams.parse(seq) is None + + +def test_hyperlinkparams_make_open(): + assert HyperlinkParams(url='http://x.com', params='id=a', terminator='\x07').make_open() == '\x1b]8;id=a;http://x.com\x07' + + +def test_hyperlinkparams_make_close(): + assert HyperlinkParams(url='http://x.com', terminator='\x07').make_close() == '\x1b]8;;\x07' + + +_HL = '\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07' + + +def test_hyperlink_parse_valid(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.text == 'Hello' + assert hl.params.url == 'http://ex.com' + + +@pytest.mark.parametrize('text,start', [ + ('Hello world', 0), + ('\x1b[31mHello\x1b[0m', 0), # SGR, not OSC 8 + ('\x1b]8;;http://x.com\x07Hello', 0), # open without close +]) +def test_hyperlink_parse_returns_none(text, start): + assert Hyperlink.parse(text, start) is None + + +def test_hyperlink_find_close_not_found(): + assert Hyperlink.find_close('no escape here', 0) == (-1, -1) + + +def test_hyperlink_make_sequence(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.make_sequence() == _HL + + +def test_hyperlink_display_width(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.display_width() == 5 diff --git a/wcwidth/hyperlink.py b/wcwidth/hyperlink.py new file mode 100644 index 0000000..6a1aeb9 --- /dev/null +++ b/wcwidth/hyperlink.py @@ -0,0 +1,142 @@ +""" +OSC 8 hyperlink parsing and measurement. + +.. versionadded:: 0.7.0 +""" + +from __future__ import annotations + +# std imports +import re + +import typing + +# local +from ._width import width as _width +from .escape_sequences import _SEQUENCE_CLASSIFY + +HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') +HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(\x07|\x1b\\)') + + +class HyperlinkParams(typing.NamedTuple): + r""" + Parsed parameters from an OSC 8 hyperlink open sequence. + + :param url: The hyperlink URL. 
+ :param params: Colon-separated metadata string (often empty). + :param terminator: Sequence terminator (``\x07`` or ``\x1b\\``). + """ + + url: str + params: str = '' + terminator: str = '\x07' + + @classmethod + def parse(cls, seq: str) -> HyperlinkParams | None: + r""" + Parse an OSC 8 open sequence string. + + Returns ``None`` if *seq* is not a valid OSC 8 open. + + Example:: + + >>> HyperlinkParams.parse('\x1b]8;;http://example.com\x07') + HyperlinkParams(url='http://example.com', params='', terminator='\\x07') + """ + m = HYPERLINK_OPEN_RE.match(seq) + if m is None: + return None + return cls(url=m.group(2), params=m.group(1), terminator=m.group(3)) + + def make_open(self) -> str: + """Generate the OSC 8 open escape sequence.""" + return f'\x1b]8;{self.params};{self.url}{self.terminator}' + + def make_close(self) -> str: + """Generate the OSC 8 close escape sequence.""" + return f'\x1b]8;;{self.terminator}' + + +class Hyperlink(typing.NamedTuple): + """ + A complete OSC 8 hyperlink with target and inner text. + + :param params: Parsed open sequence parameters. + :param text: Inner text between the open and close sequences. + """ + + params: HyperlinkParams + text: str + + @classmethod + def find_close(cls, text: str, open_end: int) -> tuple[int, int]: + """ + Find the matching OSC 8 close sequence. + + Searches 'text' starting at 'open_end', the position just past the open + sequence. Returns position of close sequence ``(close_start, + close_end)`` or ``(-1, -1)`` if not found. + + Per the OSC 8 specification, terminal emulators treat hyperlinks as a + state attribute, not as nested HTML anchors. A close sequence closes + the current hyperlink regardless of how many open sequences preceded it. 
+ """ + m = HYPERLINK_CLOSE_RE.search(text, open_end) + if m is None: + return (-1, -1) + return (m.start(), m.end()) + + def display_width( + self, + *, + control_codes: typing.Literal['parse', 'strict', 'ignore'] = 'parse', + tabsize: int = 8, + ambiguous_width: int = 1, + ) -> int: + r""" + Measure the display width of the hyperlink's inner text. + + Delegates to :func:`wcwidth.width` with the given parameters. + + Example:: + + >>> hl = Hyperlink.parse('\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07', 0) + >>> hl.display_width() + 5 + """ + return _width( + self.text, + control_codes=control_codes, + tabsize=tabsize, + ambiguous_width=ambiguous_width, + ) + + @classmethod + def parse(cls, text: str, start: int = 0) -> Hyperlink | None: + r""" + Parse a complete OSC 8 hyperlink unit from *text* at position *start*. + + Locates the open sequence, finds the matching close, and returns a + ``Hyperlink`` containing the parsed parameters and inner text. Returns + ``None`` if the text at *start* is not a complete OSC 8 hyperlink. 
+ + Example:: + + >>> Hyperlink.parse('\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07') + Hyperlink(params=HyperlinkParams(url='http://ex.com', ...), text='Hello') + """ + m = _SEQUENCE_CLASSIFY.match(text, start) + if m is None: + return None + params = HyperlinkParams.parse(m.group()) + if params is None: + return None + close_start, close_end = cls.find_close(text, m.end()) + if (close_start, close_end) == (-1, -1): + return None + return cls(params=params, text=text[m.end():close_start]) + + def make_sequence(self) -> str: + """Rebuild the complete OSC 8 hyperlink escape sequence.""" + return self.params.make_open() + self.text + self.params.make_close() From 480992444728cf4e321c7415be09ff0da0b80bfa Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 12:27:53 -0400 Subject: [PATCH 56/70] docfix --- docs/intro.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 370833e..95224f9 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -303,16 +303,17 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se >>> clip('\x1b[31m中文\x1b[32m', 0, 3, propagate_sgr=False) '\x1b[31m中 \x1b[32m' - >>> # Cursor-left overwrites previous text (painter's algorithm) >>> clip('hello\x1b[2DXY', 0, 5) 'helXY' >>> # Carriage return resets to column 0, overwriting earlier cells >>> clip('abc\rXY', 0, 5) 'XYc' - >>> # OSC 8 hyperlink text is clipped and the hyperlink rebuilt - >>> clip('\x1b]8;;http://x.com\x07Click This link\x1b]8;;\x07', 6, 10) - '\x1b]8;;http://x.com\x07This\x1b]8;;\x07' + + >>> # even OSC 8 hyperlink text may be clipped, 'Click This link' -> 'is link' ! 
+ >>> clip('\x1b]8;;http://x.com\x07Click This link\x1b]8;;\x07', 8, 15) + '\x1b]8;;http://example.com\x07is link\x1b]8;;\x07' + strip_sequences() ----------------- From e0e310ec689691d3206f0a66d25c61e73d3c6c94 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 12:41:36 -0400 Subject: [PATCH 57/70] doc refactor --- docs/intro.rst | 5 +---- tests/test_clip.py | 6 +++--- tests/test_clip_cursors.py | 8 ++++---- tests/test_hyperlink.py | 12 ++++++------ wcwidth/align.py | 6 ++++-- wcwidth/escape_sequences.py | 3 --- wcwidth/hyperlink.py | 6 +++--- wcwidth/sgr_state.py | 2 +- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index 95224f9..cf4d88f 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -134,9 +134,6 @@ Use function `width()`_ to measure a string with improved handling of ``control_ >>> # tabs are measured as though the string begins at a tabstop, >>> wcwidth.width('\t', tabsize=4) 4 - >>> # Kitty text sizing protocol (OSC 66) are measured, eg. 2x-scaled "Hello" - >>> wcwidth.width('\x1b]66;s=2;Hello\x07') - 10 >>> # or, all control characters can be ignored (including tab) >>> wcwidth.width('\t\n\a\r', control_codes='ignore') 0 @@ -311,7 +308,7 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se 'XYc' >>> # even OSC 8 hyperlink text may be clipped, 'Click This link' -> 'is link' ! 
- >>> clip('\x1b]8;;http://x.com\x07Click This link\x1b]8;;\x07', 8, 15) + >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 8, 15) '\x1b]8;;http://example.com\x07is link\x1b]8;;\x07' strip_sequences() diff --git a/tests/test_clip.py b/tests/test_clip.py index 2c17a11..d7369a0 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -181,8 +181,8 @@ def test_clip_sequences_osc_hyperlink(): (f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}', 0, 5, f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}'), # Hyperlink with params preserved - ('\x1b]8;id=myid;http://x.com\x07link\x1b]8;;\x07', 1, 3, - '\x1b]8;id=myid;http://x.com\x07in\x1b]8;;\x07'), + ('\x1b]8;id=myid;http://example.com\x07link\x1b]8;;\x07', 1, 3, + '\x1b]8;id=myid;http://example.com\x07in\x1b]8;;\x07'), # Hyperlink text before clip window, hyperlink within (f'before{OSC_START_BEL}link{OSC_END_BEL}', 6, 10, f'{OSC_START_BEL}link{OSC_END_BEL}'), @@ -190,7 +190,7 @@ def test_clip_sequences_osc_hyperlink(): (f'{OSC_START_BEL}\x1b[31mred link\x1b[0m{OSC_END_BEL}', 4, 8, f'{OSC_START_BEL}\x1b[31mlink\x1b[0m{OSC_END_BEL}'), # Hyperlink open without matching close -- preserved as regular sequence - ('\x1b]8;;http://x.com\x07link', 0, 4, '\x1b]8;;http://x.com\x07link'), + ('\x1b]8;;http://example.com\x07link', 0, 4, '\x1b]8;;http://example.com\x07link'), # Bare ESC between hyperlink markers ('\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07', 0, 6, '\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07'), diff --git a/tests/test_clip_cursors.py b/tests/test_clip_cursors.py index d53bb34..ca32146 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_cursors.py @@ -26,7 +26,7 @@ # Cursor-left with no visible tokens emitted ("\x1b[5C\x1b[2Dhi", 5, 7, {}, ""), # Cursor-left overwrites text, seq tokens preserve column spatial order - ("ab\x1b]8;;http://x.com\x07\x1b[2Dcd", 0, 4, {}, "cd\x1b]8;;http://x.com\x07"), + ("ab\x1b]8;;http://example.com\x07\x1b[2Dcd", 0, 4, {}, 
"cd\x1b]8;;http://example.com\x07"), # Cursor-left into wide char twice, second time on empty token triggers i < 0 break ("中\x1b[D\x1b[Da", 0, 4, {}, "a "), ('ab\x1b[5Ccd', 0, 4, {}, 'ab '), @@ -68,7 +68,7 @@ # propagate_sgr=False in painter path (225->226) ('ab\x1b[2Dcd', 0, 4, {'propagate_sgr': False}, 'cd'), # Non-SGR sequence before any visible text in painter (225->226 True) - ('\x1b]8;;http://x.com\x07ab\x1b[2Dcd', 0, 4, {}, '\x1b]8;;http://x.com\x07cd'), + ('\x1b]8;;http://example.com\x07ab\x1b[2Dcd', 0, 4, {}, '\x1b]8;;http://example.com\x07cd'), # Bare ESC at end of text in painter (239->240) ('ab\x1b[2D\x1b', 0, 2, {}, '\x1bab'), # Wide char overwritten from right side (212 orphan fixup) @@ -94,9 +94,9 @@ # HPA no-param inside clip window ('abc\x1b[GXY', 1, 4, {}, 'Yc'), # walk_col >= end with sequences at column == end (line 351) - ('\x1b[5C\x1b]8;;http://x.com\x07', 0, 5, {'propagate_sgr': False}, ' \x1b]8;;http://x.com\x07'), + ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 5, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), # Trailing sequences past col_limit (line 374) - ('\x1b[5C\x1b]8;;http://x.com\x07', 0, 3, {'propagate_sgr': False}, ' \x1b]8;;http://x.com\x07'), + ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 3, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), ]) def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expected): """Verify clip() output matches terminal-visible columns after cursor moves.""" diff --git a/tests/test_hyperlink.py b/tests/test_hyperlink.py index c2eff89..7b083a8 100644 --- a/tests/test_hyperlink.py +++ b/tests/test_hyperlink.py @@ -8,7 +8,7 @@ PARAMS_PARSE_VALID = [ ('\x1b]8;;http://example.com\x07', 'http://example.com', '', '\x07'), - ('\x1b]8;id=a;http://x.com\x1b\\', 'http://x.com', 'id=a', '\x1b\\'), + ('\x1b]8;id=a;http://example.com\x1b\\', 'http://example.com', 'id=a', '\x1b\\'), ] @@ -33,27 +33,27 @@ def test_hyperlinkparams_parse_invalid(seq): def 
test_hyperlinkparams_make_open(): - assert HyperlinkParams(url='http://x.com', params='id=a', terminator='\x07').make_open() == '\x1b]8;id=a;http://x.com\x07' + assert HyperlinkParams(url='http://example.com', params='id=a', terminator='\x07').make_open() == '\x1b]8;id=a;http://example.com\x07' def test_hyperlinkparams_make_close(): - assert HyperlinkParams(url='http://x.com', terminator='\x07').make_close() == '\x1b]8;;\x07' + assert HyperlinkParams(url='http://example.com', terminator='\x07').make_close() == '\x1b]8;;\x07' -_HL = '\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07' +_HL = '\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07' def test_hyperlink_parse_valid(): hl = Hyperlink.parse(_HL) assert hl is not None assert hl.text == 'Hello' - assert hl.params.url == 'http://ex.com' + assert hl.params.url == 'http://example.com' @pytest.mark.parametrize('text,start', [ ('Hello world', 0), ('\x1b[31mHello\x1b[0m', 0), # SGR, not OSC 8 - ('\x1b]8;;http://x.com\x07Hello', 0), # open without close + ('\x1b]8;;http://example.com\x07Hello', 0), # open without close ]) def test_hyperlink_parse_returns_none(text, start): assert Hyperlink.parse(text, start) is None diff --git a/wcwidth/align.py b/wcwidth/align.py index abc38e7..328454b 100644 --- a/wcwidth/align.py +++ b/wcwidth/align.py @@ -109,8 +109,10 @@ def center( characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. :returns: Text padded on both sides to reach ``dest_width``. - For odd-width padding, the extra cell goes on the right (matching - Python's :meth:`str.center` behavior). + For odd-width padding, the extra cell fills in the same cell position as + Python's :meth:`str.center` behavior (the left side when ``dest_width`` is + odd, the right side when ``dest_width`` is even). + See `the eccentric str.center `_. .. 
versionadded:: 0.3.0 diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index afa8c43..9c296de 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -159,9 +159,6 @@ def strip_sequences(text: str) -> str: r""" Return text with all terminal escape sequences removed. - For sequences containing printable text, such as OSC 8 (hyperlink), - the inner text is preserved. - Unknown or incomplete ESC sequences are preserved. :param text: String that may contain terminal escape sequences. diff --git a/wcwidth/hyperlink.py b/wcwidth/hyperlink.py index 6a1aeb9..da7a3aa 100644 --- a/wcwidth/hyperlink.py +++ b/wcwidth/hyperlink.py @@ -101,7 +101,7 @@ def display_width( Example:: - >>> hl = Hyperlink.parse('\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07', 0) + >>> hl = Hyperlink.parse('\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07', 0) >>> hl.display_width() 5 """ @@ -123,8 +123,8 @@ def parse(cls, text: str, start: int = 0) -> Hyperlink | None: Example:: - >>> Hyperlink.parse('\x1b]8;;http://ex.com\x07Hello\x1b]8;;\x07') - Hyperlink(params=HyperlinkParams(url='http://ex.com', ...), text='Hello') + >>> Hyperlink.parse('\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07') + Hyperlink(params=HyperlinkParams(url='http://example.com', ...), text='Hello') """ m = _SEQUENCE_CLASSIFY.match(text, start) if m is None: diff --git a/wcwidth/sgr_state.py b/wcwidth/sgr_state.py index 5d6e988..8e6e5cc 100644 --- a/wcwidth/sgr_state.py +++ b/wcwidth/sgr_state.py @@ -308,7 +308,7 @@ def propagate_sgr(lines: Sequence[str]) -> list[str]: ['\x1b[31mhello\x1b[0m', '\x1b[31mworld\x1b[0m'] This is useful in cases of making special editors and viewers, and is used for the - default modes (propagate_sgr=True) of :func:`wcwidth.width` and :func:`wcwidth.clip`. + default modes (propagate_sgr=True) of :func:`wcwidth.wrap` and :func:`wcwidth.clip`. 
When wrapping and clipping text containing SGR sequences, maybe a previous line enabled the BLUE color--if we are viewing *only* the line following, we would want the carry over the BLUE color, From 1e8b3c873b285b8c24dcf6a22a55b3fe8931bdaa Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 12:55:54 -0400 Subject: [PATCH 58/70] return to inline/duplicated wcswidth/width I knew this would slow it down, but I was predicting 3-5% not 30-50%! --- wcwidth/_wcswidth.py | 207 +++++++++++++++++-------------------------- wcwidth/_width.py | 107 +++++++++++++++++++--- 2 files changed, 179 insertions(+), 135 deletions(-) diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 360f805..13bae8e 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Callable, Optional +from typing import Optional # local from ._wcwidth import wcwidth @@ -16,123 +16,6 @@ from .table_grapheme import ISC_CONSONANT -class GraphemeMeasurer: - """ - Stateful measurer for grapheme-aware character width. - - Encapsulates the lookbehind state that must be threaded through - sequential per-character measurements by :meth:`measure_at`. - - Callers that interleave escape sequences or control codes between - characters should call :meth:`reset_adjacency` to prevent VS16 - from applying across the gap. - """ - - def __init__(self, text: str, end: int, wcwidth_fn: Callable[[str], int]) -> None: - """Class initializer.""" - self._text = text - self._end = end - self._wcwidth_fn = wcwidth_fn - self._last_measured_idx = -2 - self._last_measured_ucs = -1 - self._last_was_virama = False - self.conjunct_pending = False - - # pylint: disable=too-complex,too-many-branches - def measure_at(self, idx: int) -> tuple[int, int]: - """ - Process character at ``text[idx]`` and return ``(next_idx, width)``. 
- - Handles ZWJ, VS16, Regional Indicators, Fitzpatrick modifiers, virama - conjunct formation, Mc spacing marks, and standard ``wcwidth`` measurement. - - ``width`` is ``-1`` for C0/C1 control characters (caller must handle). - Callers that never pass C0/C1 characters will always receive ``width >= 0``. - """ - char = self._text[idx] - ucs = ord(char) - - # ZWJ (U+200D) - if ucs == 0x200D: - if self._last_was_virama: - return (idx + 1, 0) - if idx + 1 < self._end: - # Emoji ZWJ: skip next character unconditionally. - # Preserve _last_measured_idx so VS16 checks the emoji base - # (narrow bases get +1, wide bases are already 2 cells). - self._last_was_virama = False - return (idx + 2, 0) - self._last_was_virama = False - return (idx + 1, 0) - - # VS16 (U+FE0F): converts preceding narrow character to wide. - if ucs == 0xFE0F and self._last_measured_idx >= 0: - vs_width = bisearch( - ord(self._text[self._last_measured_idx]), - VS16_NARROW_TO_WIDE['9.0.0'], - ) - # Prevent double application; preserve emoji context (_last_measured_ucs stays) - self._last_measured_idx = -2 - return (idx + 1, vs_width) - - # Regional Indicator & Fitzpatrick (both above BMP) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(self._text[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - # Second RI in pair: zero width (pair = one 2-cell flag) - self._last_measured_ucs = ucs - return (idx + 1, 0) - # Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and self._last_measured_ucs in _EMOJI_ZWJ_SET): - return (idx + 1, 0) - - # Virama conjunct formation - if self._last_was_virama and bisearch(ucs, ISC_CONSONANT): - self._last_measured_idx = idx - self._last_measured_ucs = ucs - self._last_was_virama = False - self.conjunct_pending = True - return (idx + 1, 0) - - # Normal 
character: measure with wcwidth - w = self._wcwidth_fn(char) - if w < 0: - # C0/C1 control character (returns -1: caller should handle!) - return (idx + 1, -1) - if w > 0: - extra = 1 if self.conjunct_pending else 0 - self._last_measured_idx = idx - self._last_measured_ucs = ucs - self._last_was_virama = False - self.conjunct_pending = False - return (idx + 1, w + extra) - if self._last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - self._last_measured_idx = -2 - self._last_was_virama = False - self.conjunct_pending = False - return (idx + 1, 1) - self._last_was_virama = ucs in _ISC_VIRAMA_SET - return (idx + 1, 0) - - def reset_adjacency(self) -> None: - """ - Break VS16/Fitzpatrick adjacency. - - Call after processing escape sequences or control codes to prevent VS16 and Fitzpatrick - lookbehind from applying across the gap. - """ - self._last_measured_idx = -2 - self._last_measured_ucs = -1 - - def wcswidth( pwcs: str, n: Optional[int] = None, @@ -168,8 +51,8 @@ def wcswidth( """ # pylint: disable=unused-argument,too-many-locals,too-many-statements # pylint: disable=too-complex,too-many-branches,duplicate-code - # This function intentionally kept long without delegating functions to reduce function calls in - # "hot path", the overhead per-character adds up. + # This function intentionally keeps all logic inline for performance — + # local variable state tracking avoids per-character method-call overhead. 
# Fast path: pure ASCII printable strings are always width == length if n is None and pwcs.isascii() and pwcs.isprintable(): @@ -181,12 +64,88 @@ def wcswidth( end = len(pwcs) if n is None else n total_width = 0 idx = 0 - measurer = GraphemeMeasurer(pwcs, end, _wcwidth) + + # grapheme-clustering state + last_measured_idx = -2 + last_measured_ucs = -1 + last_was_virama = False + conjunct_pending = False + while idx < end: - idx, w = measurer.measure_at(idx) + char = pwcs[idx] + ucs = ord(char) + + # ZWJ (U+200D) + if ucs == 0x200D: + if last_was_virama: + idx += 1 + elif idx + 1 < end: + last_was_virama = False + idx += 2 + else: + last_was_virama = False + idx += 1 + continue + + # VS16 (U+FE0F): converts preceding narrow character to wide. + if ucs == 0xFE0F and last_measured_idx >= 0: + total_width += bisearch( + ord(pwcs[last_measured_idx]), + VS16_NARROW_TO_WIDE['9.0.0'], + ) + last_measured_idx = -2 # prevent double application + idx += 1 + continue + + # Regional Indicator & Fitzpatrick (both above BMP) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + last_measured_ucs = ucs + idx += 1 + continue + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and last_measured_ucs in _EMOJI_ZWJ_SET): + idx += 1 + continue + + # Virama conjunct formation + if last_was_virama and bisearch(ucs, ISC_CONSONANT): + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + conjunct_pending = True + idx += 1 + continue + + # Normal character: measure with wcwidth + w = _wcwidth(char) if w < 0: + # C0/C1 control character return -1 - total_width += w - if measurer.conjunct_pending: + if w > 0: + if conjunct_pending: + total_width += 1 + conjunct_pending = False + total_width += w + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + elif last_measured_idx >= 0 and 
bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + total_width += 1 + last_measured_idx = -2 + last_was_virama = False + conjunct_pending = False + else: + last_was_virama = ucs in _ISC_VIRAMA_SET + idx += 1 + + if conjunct_pending: total_width += 1 return total_width diff --git a/wcwidth/_width.py b/wcwidth/_width.py index 42982c5..b8d7cda 100644 --- a/wcwidth/_width.py +++ b/wcwidth/_width.py @@ -4,8 +4,16 @@ # local from ._wcwidth import wcwidth -from ._wcswidth import GraphemeMeasurer, wcswidth +from .bisearch import bisearch +from ._wcswidth import wcswidth +from ._constants import (_EMOJI_ZWJ_SET, + _ISC_VIRAMA_SET, + _CATEGORY_MC_TABLE, + _FITZPATRICK_RANGE, + _REGIONAL_INDICATOR_SET) +from .table_vs16 import VS16_NARROW_TO_WIDE from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL +from .table_grapheme import ISC_CONSONANT from .escape_sequences import (_SEQUENCE_CLASSIFY, CURSOR_MOVEMENT_SEQUENCE, INDETERMINATE_EFFECT_SEQUENCE, @@ -137,7 +145,12 @@ def width( # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) - measurer = GraphemeMeasurer(text, text_len, _wcwidth) + + # grapheme-clustering state + last_measured_idx = -2 + last_measured_ucs = -1 + last_was_virama = False + conjunct_pending = False while idx < text_len: char = text[idx] @@ -178,7 +191,8 @@ def width( # 2d. 
SGR and other zero-width sequences -- no column advance idx = m.end() # Escape sequences break VS16 adjacency: reset last-measured state - measurer.reset_adjacency() + last_measured_idx = -2 + last_measured_ucs = -1 max_extent = max(max_extent, current_col) continue @@ -187,14 +201,16 @@ def width( if strict: raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") idx += 1 - measurer.reset_adjacency() + last_measured_idx = -2 + last_measured_ucs = -1 continue if char in VERTICAL_CTRL: if strict: raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") idx += 1 - measurer.reset_adjacency() + last_measured_idx = -2 + last_measured_ucs = -1 continue # 3. Horizontal movement characters @@ -213,22 +229,91 @@ def width( current_col = 0 max_extent = max(max_extent, current_col) idx += 1 - measurer.reset_adjacency() + last_measured_idx = -2 + last_measured_ucs = -1 continue # 4. Zero-width control characters if char in ZERO_WIDTH_CTRL: idx += 1 - measurer.reset_adjacency() + last_measured_idx = -2 + last_measured_ucs = -1 + continue + + # 5. Inline grapheme-clustering: ZWJ, VS16, Regional Indicators, + # Fitzpatrick, Virama conjuncts, Mc, wcwidth + ucs = ord(char) + + # ZWJ (U+200D) + if ucs == 0x200D: + if last_was_virama: + idx += 1 + elif idx + 1 < text_len: + last_was_virama = False + idx += 2 + else: + last_was_virama = False + idx += 1 + continue + + # VS16 (U+FE0F): converts preceding narrow character to wide. 
+ if ucs == 0xFE0F and last_measured_idx >= 0: + if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE['9.0.0']): + current_col += 1 + max_extent = max(max_extent, current_col) + last_measured_idx = -2 # prevent double application + idx += 1 + continue + + # Regional Indicator & Fitzpatrick (both above BMP) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + last_measured_ucs = ucs + idx += 1 + continue + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and last_measured_ucs in _EMOJI_ZWJ_SET): + idx += 1 + continue + + # Virama conjunct formation + if last_was_virama and bisearch(ucs, ISC_CONSONANT): + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + conjunct_pending = True + idx += 1 continue - # 5. ZWJ, VS16, Regional Indicators, Fitzpatrick, Virama conjuncts, Mc, wcwidth - idx, w = measurer.measure_at(idx) + # Normal character: measure with wcwidth + w = _wcwidth(char) if w > 0: + if conjunct_pending: + current_col += 1 + conjunct_pending = False current_col += w max_extent = max(max_extent, current_col) - - if measurer.conjunct_pending: + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + current_col += 1 + max_extent = max(max_extent, current_col) + last_measured_idx = -2 + last_was_virama = False + conjunct_pending = False + else: + last_was_virama = ucs in _ISC_VIRAMA_SET + idx += 1 + + if conjunct_pending: current_col += 1 max_extent = max(max_extent, current_col) return max_extent From 5e79b040f6787e82ccbd105deded621c8ab8b835 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 13:26:56 -0400 Subject: [PATCH 59/70] inline performance refactor and variable names for readability ---
wcwidth/_clip.py | 213 +++++++++++++++++++++----------------------- wcwidth/textwrap.py | 2 +- 2 files changed, 102 insertions(+), 113 deletions(-) diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 761ddf3..59d03e6 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -60,17 +60,17 @@ class _HyperlinkResult(NamedTuple): hl_col_end: int = 0 -def _apply_sgr_wrap(result: str, sgr_at_clip_start: Optional[_SGRState]) -> str: +def _apply_sgr_wrap(result: str, captured_style: Optional[_SGRState]) -> str: """ Apply SGR prefix/suffix around *result*. If an SGR state was captured at the first visible character, prefix the result with the corresponding SGR sequence and suffix with a reset if any styles are active. """ - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): + if captured_style is not None: + if prefix := _sgr_state_to_sequence(captured_style): result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): + if _sgr_state_is_active(captured_style): result += '\x1b[0m' return result @@ -193,62 +193,29 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: """ Clip text without cursor movement (simple append-to-output path). - Returns ``(result, sgr_at_clip_start)``. The caller applies SGR wrapping. + Returns ``(result, captured_style)``. The caller applies SGR wrapping. """ # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
_text = ctx.text _end = ctx.end _start = ctx.start - _propg = ctx.propagate_sgr + _track_sgr = ctx.propagate_sgr _ambw = ctx.ambiguous_width - _fillchar = ctx.fillchar - _tabsize = ctx.tabsize - _strict = ctx.strict output: list[str] = [] col = 0 idx = 0 - sgr_at_clip_start = None - sgr = _SGR_STATE_DEFAULT if _propg else None - - def _mark() -> None: - nonlocal sgr_at_clip_start - if _propg and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - - def _emit_tab(col: int) -> int: - """Expand tab, appending spaces to output list.""" - if _tabsize > 0: - next_tab = col + (_tabsize - (col % _tabsize)) - while col < next_tab: - if _start <= col < _end: - output.append(' ') - _mark() - col += 1 - else: - output.append('\t') - return col - - def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: - """Emit grapheme to output list based on visibility.""" - if gw == 0: - if _start <= col < _end: - output.append(grapheme) - elif col >= _start and col + gw <= _end: - output.append(grapheme) - _mark() - elif col < _end and col + gw > _start: - output.append(_fillchar * (min(_end, col + gw) - max(_start, col))) - _mark() + captured_style = None # snapshot of current_style at first visible character + current_style = _SGR_STATE_DEFAULT if _track_sgr else None while idx < len(_text): char = _text[idx] # Early exit: past visible region. if col >= _end and char not in '\r\x08\t\x1b': - if sgr_at_clip_start is not None: + if captured_style is not None: break - if not _propg: + if not _track_sgr: next_esc = _text.find('\x1b', idx + 1) if next_esc == -1: break @@ -262,9 +229,9 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: idx += 1 continue - # SGR: update state, do not emit. - if m.group('sgr_params') is not None and _propg and sgr is not None: - sgr = _sgr_state_update(sgr, m.group()) + # SGR: update current_style, do not emit. 
+ if m.group('sgr_params') is not None and _track_sgr and current_style is not None: + current_style = _sgr_state_update(current_style, m.group()) idx = m.end() continue @@ -283,14 +250,15 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: output.append(r.open_seq) output.append(r.clipped_inner) output.append(r.close_seq) - _mark() + if _track_sgr and captured_style is None: + captured_style = current_style col += r.inner_width idx = r.close_end continue # Indeterminate-effect sequences: raise in strict mode. seq = m.group() - if _strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + if ctx.strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError( f"Indeterminate cursor sequence at position {idx}, " f"{seq!r}" @@ -302,17 +270,40 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: continue if char == '\t': - col = _emit_tab(col) + # Expand tab, filling clip window with spaces. + if ctx.tabsize > 0: + next_tab = col + (ctx.tabsize - (col % ctx.tabsize)) + while col < next_tab: + if _start <= col < _end: + output.append(' ') + if _track_sgr and captured_style is None: + captured_style = current_style + col += 1 + else: + output.append('\t') idx += 1 continue grapheme = next(iter_graphemes(_text, start=idx)) grapheme_w = width(grapheme, ambiguous_width=_ambw) - _handle_grapheme(grapheme, grapheme_w, col) + + # Emit grapheme or fillchar depending on visibility within clip window. 
+ if grapheme_w == 0: + if _start <= col < _end: + output.append(grapheme) + elif col >= _start and col + grapheme_w <= _end: + output.append(grapheme) + if _track_sgr and captured_style is None: + captured_style = current_style + elif col < _end and col + grapheme_w > _start: + output.append(ctx.fillchar * (min(_end, col + grapheme_w) - max(_start, col))) + if _track_sgr and captured_style is None: + captured_style = current_style + col += grapheme_w idx += len(grapheme) - return ''.join(output), sgr_at_clip_start + return ''.join(output), captured_style # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements @@ -320,17 +311,15 @@ def _clip_painter(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: """ Clip text with cursor movement (painter's algorithm path). - Returns ``(result, sgr_at_clip_start)``. The caller applies SGR wrapping. + Returns ``(result, captured_style)``. The caller applies SGR wrapping. """ # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
_text = ctx.text _end = ctx.end _start = ctx.start - _propg = ctx.propagate_sgr + _track_sgr = ctx.propagate_sgr _ambw = ctx.ambiguous_width _fillchar = ctx.fillchar - _tabsize = ctx.tabsize - _strict = ctx.strict cells: dict[int, tuple[str, int]] = {} hyperlink_cells: set[int] = set() @@ -339,17 +328,13 @@ def _clip_painter(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: col = 0 idx = 0 - sgr_at_clip_start = None - sgr = _SGR_STATE_DEFAULT if _propg else None - - def _mark() -> None: - nonlocal sgr_at_clip_start - if _propg and sgr_at_clip_start is None: - sgr_at_clip_start = sgr + captured_style = None # snapshot of current_style at first visible character + current_style = _SGR_STATE_DEFAULT if _track_sgr else None def _write_cells(s: str, w: int, write_col: int, is_hyperlink: bool = False) -> None: """Write *w* cells of text *s* at *write_col*, handling wide-char splitting.""" + nonlocal captured_style for offset in range(w): src_col = write_col + offset if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: @@ -364,60 +349,30 @@ def _write_cells(s: str, w: int, write_col: int, if is_hyperlink: for offset in range(w): hyperlink_cells.add(write_col + offset) - _mark() - - def _append_seq(seq: str, at_col: Optional[int] = None) -> None: - """Append a zero-width escape sequence anchored at the current column.""" - nonlocal seq_order - c = col if at_col is None else at_col - sequences.append((c, seq_order, seq)) - seq_order += 1 - _mark() - - def _emit_tab(col: int) -> int: - """Expand tab for painter-path.""" - if _tabsize > 0: - next_tab = col + (_tabsize - (col % _tabsize)) - while col < next_tab: - if _start <= col < _end: - _write_cells(' ', 1, col) - col += 1 - else: - _append_seq('\t') - return col - - def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: - """Emit grapheme to painter-path based on visibility.""" - if gw == 0: - if _start <= col < _end: - _append_seq(grapheme) - elif col >= _start and col + gw <= _end: - 
_write_cells(grapheme, gw, col) - elif col < _end and col + gw > _start: - clip_start = max(_start, col) - for offset in range(min(_end, col + gw) - clip_start): - _write_cells(_fillchar, 1, clip_start + offset) + if _track_sgr and captured_style is None: + captured_style = current_style while idx < len(_text): char = _text[idx] # Early exit: past visible region, SGR captured, no escape ahead. - # Note: unlike _clip_simple, we cannot skip past non-escape chars when - # propagate_sgr is False, because cursor movements (\r, \x08, CSI C/D) - # depend on accurate column tracking and may move back into the visible region. - if col >= _end and sgr_at_clip_start is not None and char != '\x1b': + if col >= _end and captured_style is not None and char != '\x1b': break if char == '\x1b': m = _SEQUENCE_CLASSIFY.match(_text, idx) if not m: - _append_seq(char) + # Record lone ESC as a zero-width sequence at current column. + sequences.append((col, seq_order, char)) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style idx += 1 continue - # SGR: update state, do not emit. - if m.group('sgr_params') is not None and _propg and sgr is not None: - sgr = _sgr_state_update(sgr, m.group()) + # SGR: update current_style, do not emit. 
+ if m.group('sgr_params') is not None and _track_sgr and current_style is not None: + current_style = _sgr_state_update(current_style, m.group()) idx = m.end() continue @@ -425,7 +380,10 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: if hl_state := HyperlinkParams.parse(m.group()): r = _process_hyperlink(ctx, hl_state, m.end(), col) if r.action is _HyperlinkAction.NO_CLOSE: - _append_seq(m.group()) + sequences.append((col, seq_order, m.group())) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style idx = m.end() elif r.action is _HyperlinkAction.EMPTY: idx = r.close_end @@ -433,18 +391,22 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: col += r.inner_width idx = r.close_end else: - _append_seq(r.open_seq) + sequences.append((col, seq_order, r.open_seq)) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style _write_cells(r.clipped_inner, r.clipped_width, col, is_hyperlink=True) col += r.clipped_width - _append_seq(r.close_seq, at_col=col) + sequences.append((col, seq_order, r.close_seq)) + seq_order += 1 col = r.hl_col_end idx = r.close_end continue # Indeterminate-effect sequences: raise in strict mode. seq = m.group() - if _strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + if ctx.strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError( f"Indeterminate cursor sequence at position {idx}, " f"{seq!r}" @@ -470,7 +432,7 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: # Cursor Backward (CSI n D). 
if (cbackward_n := m.group('cbackward_n')) is not None: n_backward = int(cbackward_n) if cbackward_n else 1 - if _strict and n_backward > col: + if ctx.strict and n_backward > col: raise ValueError( f"Cursor left movement at position {idx} would move " f"{n_backward} cells left from column {col}, " @@ -481,7 +443,10 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: continue # Any other recognized sequence: preserve as-is. - _append_seq(m.group()) + sequences.append((col, seq_order, m.group())) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style idx = m.end() continue @@ -500,21 +465,45 @@ def _handle_grapheme(grapheme: str, gw: int, col: int) -> None: # Tab expansion. if char == '\t': - col = _emit_tab(col) + if ctx.tabsize > 0: + next_tab = col + (ctx.tabsize - (col % ctx.tabsize)) + while col < next_tab: + if _start <= col < _end: + _write_cells(_fillchar, 1, col) + col += 1 + else: + sequences.append((col, seq_order, '\t')) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style idx += 1 continue # Grapheme cluster. grapheme = next(iter_graphemes(_text, start=idx)) grapheme_w = width(grapheme, ambiguous_width=_ambw) - _handle_grapheme(grapheme, grapheme_w, col) + + # Emit grapheme or fillchar depending on visibility within clip window. 
+ if grapheme_w == 0: + if _start <= col < _end: + sequences.append((col, seq_order, grapheme)) + seq_order += 1 + if _track_sgr and captured_style is None: + captured_style = current_style + elif col >= _start and col + grapheme_w <= _end: + _write_cells(grapheme, grapheme_w, col) + elif col < _end and col + grapheme_w > _start: + clip_start = max(_start, col) + for offset in range(min(_end, col + grapheme_w) - clip_start): + _write_cells(_fillchar, 1, clip_start + offset) + col += grapheme_w idx += len(grapheme) result = _reconstruct_painter( cells, sequences, _start, _end, _fillchar, ) - return result, sgr_at_clip_start + return result, captured_style def clip( diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 16f6b6b..93eaec8 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -345,7 +345,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines.append(indent + line_content) is_first_line = False else: - # max_lines reached with remaining content -- + # max_lines reached with remaining content. # pop chunks until placeholder fits, then break. 
placeholder_w = self._width(self.placeholder) while current_line: From 6490d47c3ace8a59d079bdd9871c8f6945d4dcc7 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 14:00:20 -0400 Subject: [PATCH 60/70] refactor, no _ClipContext --- docs/intro.rst | 5 + tests/test_benchmarks.py | 24 ++++ wcwidth/_clip.py | 261 ++++++++++++++++++++++----------------- 3 files changed, 175 insertions(+), 115 deletions(-) diff --git a/docs/intro.rst b/docs/intro.rst index cf4d88f..687409e 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -311,6 +311,11 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 8, 15) '\x1b]8;;http://example.com\x07is link\x1b]8;;\x07' +Use ``overtyping=False`` when the input is known not to contain any cursor movement characters +(``\b``, ``\r``, ``CSI C``, ``CSI D``, ``CSI G``) for improved performance. When +``overtyping=None`` (default), a slower "Painter's algorithm" may be used after testing for the +presence of these characters. ``overtyping`` has no effect when ``control_codes='ignore'``. 
+ strip_sequences() ----------------- diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 76bd06c..b878578 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -335,6 +335,30 @@ def test_clip_long_ascii_fastpath(benchmark): benchmark(wcwidth.clip, text, 500, 600) +def test_clip_with_ansi_no_overtype(benchmark): + """Benchmark clip() with ANSI sequences, overtyping disabled.""" + text = '\x1b[31m中文字\x1b[0m' + benchmark(wcwidth.clip, text, 0, 3, overtyping=False) + + +def test_clip_complex_sgr_no_overtype(benchmark): + """Benchmark clip() with complex SGR, overtyping disabled.""" + text = '\x1b[1;38;5;208mHello world text\x1b[0m' + benchmark(wcwidth.clip, text, 6, 11, overtyping=False) + + +def test_clip_dense_ansi_no_overtype(benchmark): + """Benchmark clip() with dense ANSI, overtyping disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, overtyping=False) + + +def test_clip_dense_ansi_no_propagate_no_overtype(benchmark): + """Benchmark clip() with dense ANSI, SGR propagation and overtyping disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False, overtyping=False) + + def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 59d03e6..28254a7 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -20,20 +20,6 @@ INDETERMINATE_EFFECT_SEQUENCE) -class _ClipContext(NamedTuple): - """Immutable parameters for a clip operation.""" - - text: str - start: int - end: int - fillchar: str - tabsize: int - ambiguous_width: int - control_codes: Literal['parse', 'strict', 'ignore'] - strict: bool - propagate_sgr: bool - - class _HyperlinkAction(enum.Enum): """Outcome of processing an OSC 8 hyperlink unit.""" @@ -76,7 
+62,14 @@ def _apply_sgr_wrap(result: str, captured_style: Optional[_SGRState]) -> str: def _process_hyperlink( - ctx: _ClipContext, + text: str, + start: int, + end: int, + fillchar: str, + tabsize: int, + ambiguous_width: int, + control_codes: Literal['parse', 'strict', 'ignore'], + *, params: HyperlinkParams, match_end: int, col: int, @@ -87,13 +80,14 @@ def _process_hyperlink( Finds the matching close sequence, measures the inner text width, and determines whether the hyperlink is empty, outside the clip window, or visible (requiring inner-text clipping). """ - close_start, close_end = Hyperlink.find_close(ctx.text, match_end) + # pylint: disable=too-many-locals,too-many-positional-arguments + close_start, close_end = Hyperlink.find_close(text, match_end) if (close_start, close_end) == (-1, -1): return _HyperlinkResult(_HyperlinkAction.NO_CLOSE) - inner_text = ctx.text[match_end:close_start] + inner_text = text[match_end:close_start] inner_width = width( - inner_text, control_codes=ctx.control_codes, - tabsize=ctx.tabsize, ambiguous_width=ctx.ambiguous_width, + inner_text, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, ) if inner_width == 0: @@ -101,24 +95,24 @@ def _process_hyperlink( hl_col_end = col + inner_width - if hl_col_end <= ctx.start or col >= ctx.end: + if hl_col_end <= start or col >= end: return _HyperlinkResult(_HyperlinkAction.OUTSIDE, close_end=close_end, inner_width=inner_width) - inner_clip_start = max(0, ctx.start - col) - inner_clip_end = ctx.end - col + inner_clip_start = max(0, start - col) + inner_clip_end = end - col clipped_inner = clip( inner_text, inner_clip_start, inner_clip_end, - fillchar=ctx.fillchar, tabsize=ctx.tabsize, - ambiguous_width=ctx.ambiguous_width, + fillchar=fillchar, tabsize=tabsize, + ambiguous_width=ambiguous_width, propagate_sgr=False, - control_codes=ctx.control_codes, + control_codes=control_codes, ) clipped_width = width( - clipped_inner, control_codes=ctx.control_codes, - 
tabsize=ctx.tabsize, ambiguous_width=ctx.ambiguous_width, + clipped_inner, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, ) return _HyperlinkResult( @@ -133,7 +127,6 @@ def _process_hyperlink( ) -# pylint: disable=too-many-locals def _reconstruct_painter( cells: dict[int, tuple[str, int]], sequences: list[tuple[int, int, str]], @@ -147,6 +140,7 @@ def _reconstruct_painter( Walks columns left-to-right, interleaving escape sequences and cell content, filling gaps with *fillchar*. """ + # pylint: disable=too-many-locals # Group and sort sequences by column, preserving insertion order within each. seqs_by_col: dict[int, list[tuple[int, str]]] = {} for col_pos, order, seq_text in sequences: @@ -188,56 +182,67 @@ def _reconstruct_painter( return ''.join(parts) -# pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements -def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: +def _clip_simple( + text: str, + start: int, + end: int, + *, + propagate_sgr: bool, + ambiguous_width: int, + fillchar: str, + tabsize: int, + strict: bool, + control_codes: Literal['parse', 'strict', 'ignore'], +) -> tuple[str, Optional[_SGRState]]: """ Clip text without cursor movement (simple append-to-output path). Returns ``(result, captured_style)``. The caller applies SGR wrapping. """ - # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
- _text = ctx.text - _end = ctx.end - _start = ctx.start - _track_sgr = ctx.propagate_sgr - _ambw = ctx.ambiguous_width + # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements + # pylint: disable=too-many-nested-blocks + # code length and complexity traded for performance, to allow this to be used as a "hot path" output: list[str] = [] col = 0 idx = 0 captured_style = None # snapshot of current_style at first visible character - current_style = _SGR_STATE_DEFAULT if _track_sgr else None + current_style = _SGR_STATE_DEFAULT if propagate_sgr else None - while idx < len(_text): - char = _text[idx] + while idx < len(text): + char = text[idx] # Early exit: past visible region. - if col >= _end and char not in '\r\x08\t\x1b': + if col >= end and char not in '\r\x08\t\x1b': if captured_style is not None: break - if not _track_sgr: - next_esc = _text.find('\x1b', idx + 1) + if not propagate_sgr: + next_esc = text.find('\x1b', idx + 1) if next_esc == -1: break idx = next_esc continue if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(_text, idx) + m = _SEQUENCE_CLASSIFY.match(text, idx) if not m: output.append(char) idx += 1 continue # SGR: update current_style, do not emit. - if m.group('sgr_params') is not None and _track_sgr and current_style is not None: + if m.group('sgr_params') is not None and propagate_sgr and current_style is not None: current_style = _sgr_state_update(current_style, m.group()) idx = m.end() continue # OSC 8 hyperlink. 
if hl_state := HyperlinkParams.parse(m.group()): - r = _process_hyperlink(ctx, hl_state, m.end(), col) + r = _process_hyperlink( + text, start, end, fillchar, tabsize, ambiguous_width, + control_codes, + params=hl_state, match_end=m.end(), col=col, + ) if r.action is _HyperlinkAction.NO_CLOSE: output.append(m.group()) idx = m.end() @@ -250,7 +255,7 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: output.append(r.open_seq) output.append(r.clipped_inner) output.append(r.close_seq) - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style col += r.inner_width idx = r.close_end @@ -258,7 +263,7 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: # Indeterminate-effect sequences: raise in strict mode. seq = m.group() - if ctx.strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError( f"Indeterminate cursor sequence at position {idx}, " f"{seq!r}" @@ -271,12 +276,12 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: if char == '\t': # Expand tab, filling clip window with spaces. - if ctx.tabsize > 0: - next_tab = col + (ctx.tabsize - (col % ctx.tabsize)) + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) while col < next_tab: - if _start <= col < _end: + if start <= col < end: output.append(' ') - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style col += 1 else: @@ -284,20 +289,20 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: idx += 1 continue - grapheme = next(iter_graphemes(_text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=_ambw) + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) # Emit grapheme or fillchar depending on visibility within clip window. 
if grapheme_w == 0: - if _start <= col < _end: + if start <= col < end: output.append(grapheme) - elif col >= _start and col + grapheme_w <= _end: + elif col >= start and col + grapheme_w <= end: output.append(grapheme) - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style - elif col < _end and col + grapheme_w > _start: - output.append(ctx.fillchar * (min(_end, col + grapheme_w) - max(_start, col))) - if _track_sgr and captured_style is None: + elif col < end and col + grapheme_w > start: + output.append(fillchar * (min(end, col + grapheme_w) - max(start, col))) + if propagate_sgr and captured_style is None: captured_style = current_style col += grapheme_w @@ -306,20 +311,26 @@ def _clip_simple(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: return ''.join(output), captured_style -# pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements -def _clip_painter(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: +def _clip_painter( + text: str, + start: int, + end: int, + *, + propagate_sgr: bool, + ambiguous_width: int, + fillchar: str, + tabsize: int, + strict: bool, + control_codes: Literal['parse', 'strict', 'ignore'], +) -> tuple[str, Optional[_SGRState]]: """ Clip text with cursor movement (painter's algorithm path). Returns ``(result, captured_style)``. The caller applies SGR wrapping. """ - # Bind hot-path attributes to locals (LOAD_FAST instead of LOAD_ATTR). 
- _text = ctx.text - _end = ctx.end - _start = ctx.start - _track_sgr = ctx.propagate_sgr - _ambw = ctx.ambiguous_width - _fillchar = ctx.fillchar + # pylint: disable=too-complex,too-many-locals,too-many-branches + # pylint: disable=too-many-statements,too-many-nested-blocks + # code length and complexity traded for performance, to allow this to be used as a "hot path" cells: dict[int, tuple[str, int]] = {} hyperlink_cells: set[int] = set() @@ -329,7 +340,7 @@ def _clip_painter(ctx: _ClipContext) -> tuple[str, Optional[_SGRState]]: col = 0 idx = 0 captured_style = None # snapshot of current_style at first visible character - current_style = _SGR_STATE_DEFAULT if _track_sgr else None + current_style = _SGR_STATE_DEFAULT if propagate_sgr else None def _write_cells(s: str, w: int, write_col: int, is_hyperlink: bool = False) -> None: @@ -338,10 +349,10 @@ def _write_cells(s: str, w: int, write_col: int, for offset in range(w): src_col = write_col + offset if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: - cells[src_col - 1] = (_fillchar, 1) + cells[src_col - 1] = (fillchar, 1) hyperlink_cells.discard(src_col - 1) if cells.get(src_col, ('', 0))[1] == 2: - cells[src_col + 1] = (_fillchar, 1) + cells[src_col + 1] = (fillchar, 1) hyperlink_cells.discard(src_col + 1) cells.pop(src_col, None) hyperlink_cells.discard(src_col) @@ -349,40 +360,44 @@ def _write_cells(s: str, w: int, write_col: int, if is_hyperlink: for offset in range(w): hyperlink_cells.add(write_col + offset) - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style - while idx < len(_text): - char = _text[idx] + while idx < len(text): + char = text[idx] # Early exit: past visible region, SGR captured, no escape ahead. 
- if col >= _end and captured_style is not None and char != '\x1b': + if col >= end and captured_style is not None and char != '\x1b': break if char == '\x1b': - m = _SEQUENCE_CLASSIFY.match(_text, idx) + m = _SEQUENCE_CLASSIFY.match(text, idx) if not m: # Record lone ESC as a zero-width sequence at current column. sequences.append((col, seq_order, char)) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style idx += 1 continue # SGR: update current_style, do not emit. - if m.group('sgr_params') is not None and _track_sgr and current_style is not None: + if m.group('sgr_params') is not None and propagate_sgr and current_style is not None: current_style = _sgr_state_update(current_style, m.group()) idx = m.end() continue # OSC 8 hyperlink. if hl_state := HyperlinkParams.parse(m.group()): - r = _process_hyperlink(ctx, hl_state, m.end(), col) + r = _process_hyperlink( + text, start, end, fillchar, tabsize, ambiguous_width, + control_codes, + params=hl_state, match_end=m.end(), col=col, + ) if r.action is _HyperlinkAction.NO_CLOSE: sequences.append((col, seq_order, m.group())) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style idx = m.end() elif r.action is _HyperlinkAction.EMPTY: @@ -393,7 +408,7 @@ def _write_cells(s: str, w: int, write_col: int, else: sequences.append((col, seq_order, r.open_seq)) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style _write_cells(r.clipped_inner, r.clipped_width, col, is_hyperlink=True) @@ -406,7 +421,7 @@ def _write_cells(s: str, w: int, write_col: int, # Indeterminate-effect sequences: raise in strict mode. 
seq = m.group() - if ctx.strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): raise ValueError( f"Indeterminate cursor sequence at position {idx}, " f"{seq!r}" @@ -422,9 +437,9 @@ def _write_cells(s: str, w: int, write_col: int, if (cforward_n := m.group('cforward_n')) is not None: n_forward = int(cforward_n) if cforward_n else 1 move_end = col + n_forward - if col < _end and move_end > _start: - for i in range(max(col, _start), min(move_end, _end)): - _write_cells(_fillchar, 1, i) + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) col = move_end idx = m.end() continue @@ -432,7 +447,7 @@ def _write_cells(s: str, w: int, write_col: int, # Cursor Backward (CSI n D). if (cbackward_n := m.group('cbackward_n')) is not None: n_backward = int(cbackward_n) if cbackward_n else 1 - if ctx.strict and n_backward > col: + if strict and n_backward > col: raise ValueError( f"Cursor left movement at position {idx} would move " f"{n_backward} cells left from column {col}, " @@ -445,7 +460,7 @@ def _write_cells(s: str, w: int, write_col: int, # Any other recognized sequence: preserve as-is. sequences.append((col, seq_order, m.group())) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style idx = m.end() continue @@ -465,43 +480,43 @@ def _write_cells(s: str, w: int, write_col: int, # Tab expansion. 
if char == '\t': - if ctx.tabsize > 0: - next_tab = col + (ctx.tabsize - (col % ctx.tabsize)) + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) while col < next_tab: - if _start <= col < _end: - _write_cells(_fillchar, 1, col) + if start <= col < end: + _write_cells(fillchar, 1, col) col += 1 else: sequences.append((col, seq_order, '\t')) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style idx += 1 continue # Grapheme cluster. - grapheme = next(iter_graphemes(_text, start=idx)) - grapheme_w = width(grapheme, ambiguous_width=_ambw) + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) # Emit grapheme or fillchar depending on visibility within clip window. if grapheme_w == 0: - if _start <= col < _end: + if start <= col < end: sequences.append((col, seq_order, grapheme)) seq_order += 1 - if _track_sgr and captured_style is None: + if propagate_sgr and captured_style is None: captured_style = current_style - elif col >= _start and col + grapheme_w <= _end: + elif col >= start and col + grapheme_w <= end: _write_cells(grapheme, grapheme_w, col) - elif col < _end and col + grapheme_w > _start: - clip_start = max(_start, col) - for offset in range(min(_end, col + grapheme_w) - clip_start): - _write_cells(_fillchar, 1, clip_start + offset) + elif col < end and col + grapheme_w > start: + clip_start = max(start, col) + for offset in range(min(end, col + grapheme_w) - clip_start): + _write_cells(fillchar, 1, clip_start + offset) col += grapheme_w idx += len(grapheme) result = _reconstruct_painter( - cells, sequences, _start, _end, _fillchar, + cells, sequences, start, end, fillchar, ) return result, captured_style @@ -516,6 +531,7 @@ def clip( ambiguous_width: int = 1, propagate_sgr: bool = True, control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + overtyping: Optional[bool] = None, ) -> str: r""" Clip 
text to display columns ``(start, end)`` while preserving all terminal sequences. @@ -564,6 +580,14 @@ def clip( - ``'ignore'``: All control characters are treated as zero-width. Cursor movement is not tracked (fastest path). + :param overtyping: Whether to use the painter's algorithm for cursor + movement (``\b`` backspace, ``\r`` carriage return, and CSI cursor + left/right/position sequences). When ``None`` (default), auto-detects + by scanning for these characters in *text*. Set to ``False`` for improved + performance when the caller knows *text* contains no cursor movement + characters. Set to ``True`` to force the painter's algorithm (useful + for testing). Has no effect when ``control_codes='ignore'``. + :returns: Substring of ``text`` spanning display columns ``(start, end)``, with all terminal sequences preserved and wide characters at boundaries replaced with ``fillchar``. @@ -588,6 +612,9 @@ def clip( Added ``control_codes`` parameter (default 'parse'). OSC 8 hyperlink-aware clipping. + .. versionchanged:: 0.8.0 + Added ``overtyping`` parameter (default None, auto-detect). + Example:: >>> clip('hello world', 0, 5) @@ -610,23 +637,27 @@ def clip( if propagate_sgr and not has_esc: propagate_sgr = False - # Use painter's algorithm only when cursor movement can overwrite cells. - fn_clip = _clip_painter if ( - control_codes != 'ignore' and - ('\x08' in text or '\r' in text or - (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) - ) else _clip_simple + # Determine whether painter's algorithm is needed. + if overtyping is None: + # Auto-detect: scan for cursor movement characters. 
+ overtyping = ( + control_codes != 'ignore' and + ('\x08' in text or '\r' in text or + (has_esc and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))) + ) + elif overtyping and control_codes == 'ignore': + overtyping = False # control_codes='ignore' overrides + + fn_clip = _clip_painter if overtyping else _clip_simple - ctx = _ClipContext( + return _apply_sgr_wrap(*fn_clip( text=text, start=start, end=end, + propagate_sgr=propagate_sgr, + ambiguous_width=ambiguous_width, fillchar=fillchar, tabsize=tabsize, - ambiguous_width=ambiguous_width, - control_codes=control_codes, strict=(control_codes == 'strict'), - propagate_sgr=propagate_sgr, - ) - - return _apply_sgr_wrap(*fn_clip(ctx)) + control_codes=control_codes, + )) From fd7dc4437c062476ea4d692d1482c6816de729a5 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 14:41:13 -0400 Subject: [PATCH 61/70] refactor test_clip_cursors -> test_clip_overtyping --- tests/test_clip.py | 67 +++++++++++++++++++ ...lip_cursors.py => test_clip_overtyping.py} | 62 +++-------------- wcwidth/_clip.py | 40 +++++++---- 3 files changed, 103 insertions(+), 66 deletions(-) rename tests/{test_clip_cursors.py => test_clip_overtyping.py} (75%) diff --git a/tests/test_clip.py b/tests/test_clip.py index d7369a0..8ab3f1d 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -385,3 +385,70 @@ def test_clip_control_chars_zero_width(text, start, end, expected): def test_clip_tab_first_visible_with_sgr(): """Tab as first visible character with SGR propagation.""" assert clip('\x1b[31m\tb', 0, 4, tabsize=8) == '\x1b[31m \x1b[0m' + + +def test_clip_overtyping_override_by_control_codes_ignore(): + """When overtyping=True and control_codes='ignore', overtyping is overridden to False.""" + # elif entered: overtyping=True + control_codes='ignore' → overtyping=False + assert clip('hello world', 0, 5, overtyping=True, control_codes='ignore') == 'hello' + # Verify that overtyping is actually disabled: cursor movement chars are + # treated 
as zero-width, so the result is the same as without overtyping. + assert clip('ab\x08cd', 0, 4, overtyping=True, control_codes='ignore') == 'ab\x08cd' + + +def test_clip_overtyping_without_ignore(): + """When overtyping=True and control_codes='parse', elif is not entered.""" + # elif skipped: overtyping=True + control_codes='parse' → overtyping stays True + # The painter path is used, cursor movement sequences affect output. + assert clip('ab\x1b[2Dcd', 0, 4, overtyping=True, control_codes='parse') == 'cd' + + +# Indeterminate-effect sequences that raise ValueError in strict mode +# (matching width() behavior). These are not cursor-movement sequences, +# so they exercise the simple (non-overtyping) path. + +INDETERMINATE_SEQUENCES = [ + ('\x1b[K', 'erase_in_line'), + ('\x1b[2K', 'erase_in_line_params'), + ('\x1b[J', 'erase_in_display'), + ('\x1b[2J', 'erase_in_display_params'), + ('\x1b[H', 'cursor_home'), + ('\x1b[1;1H', 'cursor_address'), + ('\x1b[A', 'cursor_up'), + ('\x1b[2A', 'cursor_up_params'), + ('\x1b[B', 'cursor_down'), + ('\x1b[5B', 'cursor_down_params'), + ('\x1b[P', 'delete_character'), + ('\x1b[1P', 'parm_dch'), + ('\x1b[M', 'delete_line'), + ('\x1b[1M', 'parm_delete_line'), + ('\x1b[L', 'insert_line'), + ('\x1b[1L', 'parm_insert_line'), + ('\x1b[@', 'insert_character'), + ('\x1b[1X', 'erase_chars'), + ('\x1b[S', 'scroll_up'), + ('\x1b[T', 'scroll_down'), + ('\x1b[?1049h', 'enter_fullscreen'), + ('\x1b[?1049l', 'exit_fullscreen'), + ('\x1bD', 'scroll_forward'), + ('\x1bM', 'scroll_reverse'), + ('\x1b8', 'restore_cursor'), + ('\x1bc', 'full_reset'), +] + + +@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) +def test_clip_strict_indeterminate_raises(seq, cap_name): + """Clip() strict mode raises ValueError on indeterminate-effect sequences.""" + with pytest.raises(ValueError, match='Indeterminate cursor sequence'): + clip(f'hello{seq}world', 0, 10, control_codes='strict') + + +@pytest.mark.parametrize('seq,cap_name', 
INDETERMINATE_SEQUENCES) +def test_clip_parse_indeterminate_preserved(seq, cap_name): + """Clip() parse mode preserves indeterminate sequences as zero-width.""" + result = clip(f'hello{seq}world', 0, 10, control_codes='parse') + # The sequence is preserved, visible text is hello + world = 10 chars + assert 'hello' in result + assert 'world' in result + assert seq in result diff --git a/tests/test_clip_cursors.py b/tests/test_clip_overtyping.py similarity index 75% rename from tests/test_clip_cursors.py rename to tests/test_clip_overtyping.py index ca32146..9206e77 100644 --- a/tests/test_clip_cursors.py +++ b/tests/test_clip_overtyping.py @@ -1,8 +1,14 @@ """ -Tests for clip() handling of cursor left/right sequences (CSI C / CSI D). +Tests for clip()'s overtyping (painter) path. -These tests codify expected visible results when cursor movement sequences affect horizontal -positions. They are intentionally specific and will drive future implementation changes in clip(). +The painter algorithm is used when the text contains cursor movement sequences +(CSI n C/D, backspace, carriage return, HPA) that require column-level tracking +to determine the final visible output. Auto-detection of the overtyping path +happens in clip() via the presence of \\x08, \\r, or horizontal cursor movement +escape sequences, or can be forced with ``overtyping=True``. + +These tests codify expected visible results when cursor movement sequences +affect horizontal positions. """ # 3rd party @@ -122,56 +128,6 @@ def test_clip_cursor_left_out_of_bounds_parse_no_raise(): assert clip('ab\x1b[99Dcd', 0, 4) == 'cd' -# Indeterminate-effect sequences that raise ValueError in strict mode -# (matching width() behavior). 
- -INDETERMINATE_SEQUENCES = [ - ('\x1b[K', 'erase_in_line'), - ('\x1b[2K', 'erase_in_line_params'), - ('\x1b[J', 'erase_in_display'), - ('\x1b[2J', 'erase_in_display_params'), - ('\x1b[H', 'cursor_home'), - ('\x1b[1;1H', 'cursor_address'), - ('\x1b[A', 'cursor_up'), - ('\x1b[2A', 'cursor_up_params'), - ('\x1b[B', 'cursor_down'), - ('\x1b[5B', 'cursor_down_params'), - ('\x1b[P', 'delete_character'), - ('\x1b[1P', 'parm_dch'), - ('\x1b[M', 'delete_line'), - ('\x1b[1M', 'parm_delete_line'), - ('\x1b[L', 'insert_line'), - ('\x1b[1L', 'parm_insert_line'), - ('\x1b[@', 'insert_character'), - ('\x1b[1X', 'erase_chars'), - ('\x1b[S', 'scroll_up'), - ('\x1b[T', 'scroll_down'), - ('\x1b[?1049h', 'enter_fullscreen'), - ('\x1b[?1049l', 'exit_fullscreen'), - ('\x1bD', 'scroll_forward'), - ('\x1bM', 'scroll_reverse'), - ('\x1b8', 'restore_cursor'), - ('\x1bc', 'full_reset'), -] - - -@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) -def test_clip_strict_indeterminate_raises(seq, cap_name): - """Clip() strict mode raises ValueError on indeterminate-effect sequences.""" - with pytest.raises(ValueError, match='Indeterminate cursor sequence'): - clip(f'hello{seq}world', 0, 10, control_codes='strict') - - -@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) -def test_clip_parse_indeterminate_preserved(seq, cap_name): - """Clip() parse mode preserves indeterminate sequences as zero-width.""" - result = clip(f'hello{seq}world', 0, 10, control_codes='parse') - # The sequence is preserved, visible text is hello + world = 10 chars - assert 'hello' in result - assert 'world' in result - assert seq in result - - def test_clip_strict_cr_allowed(): """Carriage return is allowed in strict mode (text begins at column 0).""" assert clip('hello\rworld', 0, 5, control_codes='strict') == 'world' diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index 28254a7..a22430a 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -206,7 +206,15 @@ def _clip_simple( output: 
list[str] = [] col = 0 idx = 0 - captured_style = None # snapshot of current_style at first visible character + # captured_style is a frozen snapshot of current_style taken at the first + # visible character emitted within the clip window (start, end). It stays + # None until that point. current_style, by contrast, is continuously + # updated by SGR sequences throughout the scan. The snapshot is what the + # caller uses to wrap the result in the correct SGR state. + # + # When propagate_sgr is False, current_style (and therefore captured_style) + # remain None, and SGR sequences pass through as literal text. + captured_style: Optional[_SGRState] = None current_style = _SGR_STATE_DEFAULT if propagate_sgr else None while idx < len(text): @@ -216,12 +224,15 @@ def _clip_simple( if col >= end and char not in '\r\x08\t\x1b': if captured_style is not None: break - if not propagate_sgr: - next_esc = text.find('\x1b', idx + 1) - if next_esc == -1: - break - idx = next_esc - continue + # propagate_sgr is always False here: with propagate_sgr=True, + # captured_style is set on the first visible emission in the + # clip window and we would have broken above. The skip-ahead + # optimization is only needed (and safe) when SGR tracking is off. + next_esc = text.find('\x1b', idx + 1) + if next_esc == -1: + break + idx = next_esc + continue if char == '\x1b': m = _SEQUENCE_CLASSIFY.match(text, idx) @@ -339,7 +350,14 @@ def _clip_painter( col = 0 idx = 0 - captured_style = None # snapshot of current_style at first visible character + # captured_style is a frozen snapshot of current_style taken at the first + # visible character emitted within the clip window (start, end). It stays + # None until that point. current_style, by contrast, is continuously + # updated by SGR sequences throughout the scan. + # + # When propagate_sgr is False, current_style (and therefore captured_style) + # remain None, and SGR sequences pass through as literal text. 
+ captured_style: Optional[_SGRState] = None current_style = _SGR_STATE_DEFAULT if propagate_sgr else None def _write_cells(s: str, w: int, write_col: int, @@ -515,10 +533,7 @@ def _write_cells(s: str, w: int, write_col: int, col += grapheme_w idx += len(grapheme) - result = _reconstruct_painter( - cells, sequences, start, end, fillchar, - ) - return result, captured_style + return _reconstruct_painter(cells, sequences, start, end, fillchar), captured_style def clip( @@ -647,7 +662,6 @@ def clip( ) elif overtyping and control_codes == 'ignore': overtyping = False # control_codes='ignore' overrides - fn_clip = _clip_painter if overtyping else _clip_simple return _apply_sgr_wrap(*fn_clip( From 2b899649449b8c72ae9fbbf3f80e018ca66addc4 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 14:46:38 -0400 Subject: [PATCH 62/70] add many cc=ignore and two overtyping=True benchmarks --- tests/test_benchmarks.py | 65 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index b878578..8d1375c 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -359,6 +359,71 @@ def test_clip_dense_ansi_no_propagate_no_overtype(benchmark): benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False, overtyping=False) +def test_clip_dense_ansi_overtype(benchmark): + """Benchmark clip() with dense ANSI, overtyping forced (painter path).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, overtyping=True) + + +def test_clip_long_cjk_overtype(benchmark): + """Benchmark clip() with long CJK, overtyping forced (painter path).""" + text = '中文测试字符串' * 100 + benchmark(wcwidth.clip, text, 0, 50, overtyping=True) + + +def test_width_dense_ansi_control_codes_ignore(benchmark): + """Benchmark width() with dense ANSI and control_codes='ignore'.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + 
benchmark(wcwidth.width, text, control_codes='ignore') + + +def test_width_complex_ansi_control_codes_ignore(benchmark): + """Benchmark width() with complex ANSI and control_codes='ignore'.""" + text = '\x1b[38;2;255;150;100mWARN\x1b[0m: \x1b[1mBold\x1b[0m \x1b[4mUnderline\x1b[0m' + benchmark(wcwidth.width, text, control_codes='ignore') + + +def test_clip_dense_ansi_control_codes_ignore(benchmark): + """Benchmark clip() with dense ANSI, control_codes='ignore' (skips painter/OSC).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, control_codes='ignore') + + +def test_clip_long_cjk_control_codes_ignore(benchmark): + """Benchmark clip() with long CJK and control_codes='ignore' (early-exit path).""" + text = '中文测试字符串' * 100 + benchmark(wcwidth.clip, text, 0, 50, control_codes='ignore') + + +def test_clip_cursor_cr_control_codes_ignore(benchmark): + """Benchmark clip() with CR overwrite and control_codes='ignore' (painter skipped).""" + text = 'hello\rworld ' * 20 + benchmark(wcwidth.clip, text, 0, 50, control_codes='ignore') + + +def test_clip_dense_ansi_no_propagate_control_codes_ignore(benchmark): + """Benchmark clip() with dense ANSI, propagate_sgr=False and control_codes='ignore'.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False, control_codes='ignore') + + +def test_clip_long_ascii_control_codes_ignore(benchmark): + """Benchmark clip() with long ASCII and control_codes='ignore' (fast-path slice).""" + text = 'hello world ' * 1000 + benchmark(wcwidth.clip, text, 500, 600, control_codes='ignore') + + +def test_wrap_with_ansi_control_codes_ignore(benchmark): + """Benchmark wrap() with ANSI sequences and control_codes='ignore'.""" + text = '\x1b[31mThe quick brown fox jumps over the lazy dog.\x1b[0m Did it really? 
' * 20 + benchmark(wcwidth.wrap, text, 40, control_codes='ignore') + + +def test_ljust_ascii_control_codes_ignore(benchmark): + """Benchmark ljust() with ASCII and control_codes='ignore'.""" + benchmark(wcwidth.ljust, 'hello', 20, control_codes='ignore') + + def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] From d2b12ea7f43bf3cf969b38a69d9be628b72e8388 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 15:47:03 -0400 Subject: [PATCH 63/70] add missing coverage, don't ignore old wcwidth.wcwidth import path, exercise it --- bin/new-wide-by-version.py | 49 ----------------------------------- tests/test_clip_overtyping.py | 16 ++++++++++++ tests/test_core.py | 14 ++++++++++ tox.ini | 1 - 4 files changed, 30 insertions(+), 50 deletions(-) delete mode 100755 bin/new-wide-by-version.py diff --git a/bin/new-wide-by-version.py b/bin/new-wide-by-version.py deleted file mode 100755 index 3ba85cc..0000000 --- a/bin/new-wide-by-version.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -""" -Display new wide unicode point values, by version. - -For example:: - - "5.0.0": [ - 12752, - 12753, - 12754, - ... - -Means that chr(12752) through chr(12754) are new WIDE values -for Unicode version 5.0.0, and were not WIDE values for the -previous version (4.1.0). 
-""" - -# std imports -import sys -import json - -# local -from wcwidth import WIDE_EASTASIAN, _bisearch - - -def main(): - """List new WIDE characters at each unicode version.""" - versions = list(WIDE_EASTASIAN.keys()) - results = {} - for version in versions: - prev_idx = versions.index(version) - 1 - if prev_idx == -1: - continue - previous_version = versions[prev_idx] - previous_table = WIDE_EASTASIAN[previous_version] - for value_pair in WIDE_EASTASIAN[version]: - for value in range(*value_pair): - if not _bisearch(value, previous_table): - results[version] = results.get(version, []) + [value] - if '--debug' in sys.argv: - print(f'version {version} has unicode character ' - f'0x{value:05x} ({chr(value)}) but previous ' - f'version, {previous_version} does not.', - file=sys.stderr) - print(json.dumps(results, indent=4)) - - -if __name__ == '__main__': - main() diff --git a/tests/test_clip_overtyping.py b/tests/test_clip_overtyping.py index 9206e77..1d106ba 100644 --- a/tests/test_clip_overtyping.py +++ b/tests/test_clip_overtyping.py @@ -103,6 +103,16 @@ ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 5, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), # Trailing sequences past col_limit (line 374) ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 3, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), + # Lone ESC as first visible thing in painter (captured_style = current_style, line 398) + ('\x1b[D\x1b\x1bXy', 0, 3, {}, '\x1b\x1bXy'), + # Hyperlink VISIBLE after captured_style already set + ('a\x1b[C\x1b]8;;http://x\x07hi\x1b]8;;\x07', 0, 5, {}, 'a \x1b]8;;http://x\x07hi\x1b]8;;\x07'), + # Tab with tabsize=0 as first visible thing in painter + ('\x1b[D\tab', 0, 2, {'tabsize': 0}, '\tab'), + # Zero-width grapheme as first visible thing in painter + ('\x1b[D\u0301x', 0, 3, {}, '\u0301x'), + # Generic escape sequence as first visible in painter + ('\x1b[D\x1b[Hxy', 0, 3, {}, '\x1b[Hxy'), ]) def test_clip_cursor_sequences_expected_behaviour(text, 
start, end, kwargs, expected): """Verify clip() output matches terminal-visible columns after cursor moves.""" @@ -141,3 +151,9 @@ def test_clip_strict_hpa_allowed(): def test_clip_strict_cursor_left_allowed(): """Cursor-left within bounds is allowed in strict mode.""" assert clip('hello\x1b[2Dxy', 0, 5, control_codes='strict') == 'helxy' + + +def test_clip_strict_indeterminate_sequence_painter(): + """Clip() strict-mode raises on indeterminate sequence in painter path.""" + with pytest.raises(ValueError, match='Indeterminate cursor sequence'): + clip('a\x1b[D\x1b[Hb', 0, 3, control_codes='strict') diff --git a/tests/test_core.py b/tests/test_core.py index 3208cbd..8f89735 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -454,6 +454,11 @@ def test_virama_conjunct(phrase, expected): assert wcwidth.width(phrase) == expected +def test_zwj_at_end_of_string(): + """ZWJ at end of string (not after virama) is consumed with zero width.""" + assert wcwidth.wcswidth('a\u200D') == 1 + + def test_soft_hyphen(): # Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most # implementations agree to draw it was '1' cell, visually @@ -483,3 +488,12 @@ def test_prepended_concatenation_mark_width(codepoint, name): """Prepended Concatenation Marks have width 1, not 0.""" # https://github.com/jquast/wcwidth/issues/119 assert wcwidth.wcwidth(chr(codepoint)) == 1 + + +def test_legacy_module(): + """Verify legacy ``wcwidth.wcwidth`` module is importable and all public items resolve.""" + # local + import wcwidth.wcwidth as legacy + + for name in legacy.__all__: + assert getattr(legacy, name) is not None, f"legacy.wcwidth.{name} is None" diff --git a/tox.ini b/tox.ini index 7c4d526..8c5d19a 100644 --- a/tox.ini +++ b/tox.ini @@ -55,7 +55,6 @@ relative_files = True [coverage:report] omit = tests/* - wcwidth/wcwidth.py exclude_lines = pragma: no cover precision = 1 From 251a319fd3c98057c8f21f7f54cf70305ed2fc35 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 
15:51:03 -0400 Subject: [PATCH 64/70] pylint --- tests/test_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_core.py b/tests/test_core.py index 8f89735..9540ca0 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -492,6 +492,7 @@ def test_prepended_concatenation_mark_width(codepoint, name): def test_legacy_module(): """Verify legacy ``wcwidth.wcwidth`` module is importable and all public items resolve.""" + # pylint: disable=import-outside-toplevel # local import wcwidth.wcwidth as legacy From ea8fffce3c536c5447b4ba02c3ef44a24447eefa Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 16:26:57 -0400 Subject: [PATCH 65/70] now i'm 100% certain of no possible ill-effects of refactor --- tests/test_core.py | 23 +++++++++++++++++------ wcwidth/__init__.py | 10 ++++++++-- wcwidth/wcwidth.py | 1 + 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 9540ca0..dd1e3b7 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -491,10 +491,21 @@ def test_prepended_concatenation_mark_width(codepoint, name): def test_legacy_module(): - """Verify legacy ``wcwidth.wcwidth`` module is importable and all public items resolve.""" + """Verify legacy ``wcwidth.wcwidth`` module's public items are importable.""" # pylint: disable=import-outside-toplevel - # local - import wcwidth.wcwidth as legacy - - for name in legacy.__all__: - assert getattr(legacy, name) is not None, f"legacy.wcwidth.{name} is None" + # std imports + import sys + + # Access the legacy submodule via sys.modules (matching 0.6.0 where + # 'import wcwidth.wcwidth' returned the function, not the module). + _legacy = sys.modules['wcwidth.wcwidth'] + + for name in _legacy.__all__: + attr = getattr(_legacy, name) + assert attr is not None, f"wcwidth.wcwidth.{name} is None" + + # Verify that individual imports from the legacy path also work, + # e.g. 
'from wcwidth.wcwidth import wcswidth' + for name in _legacy.__all__: + obj = getattr(_legacy, name) + assert obj is not None, f"could not import {name} from wcwidth.wcwidth" diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 871b23e..bc55a1d 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -26,13 +26,19 @@ from .escape_sequences import iter_sequences, strip_sequences from .unicode_versions import list_versions +# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is populated during package +# initialization. This matches the 0.6.0 behavior where 'from .wcwidth import wcwidth' would have +# already loaded the submodule. Without this, a later 'import wcwidth.wcwidth' triggers on-disk +# file discovery which rebinds wcwidth.wcwidth from the function to the module object. +from . import wcwidth as _wcwidth_module # isort:skip + + # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
__all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr', - 'Hyperlink', 'HyperlinkParams') + 'list_versions', 'propagate_sgr', 'Hyperlink', 'HyperlinkParams') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index 6afb3ac..e4895e9 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -6,6 +6,7 @@ from wcwidth.wcwidth import iter_graphemes from wcwidth.wcwidth import _SGR_PATTERN + import wcwidth.wcwidth as legacy """ # pylint: disable=unused-import From 91eae65e984677fcbfe35fbcbabe3e61623d3ada Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 16:34:16 -0400 Subject: [PATCH 66/70] fix import order --- wcwidth/__init__.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index bc55a1d..da85400 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -12,7 +12,20 @@ from ._clip import clip from .align import ljust, rjust, center from ._width import width -from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value + +# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is +# populated during package initialization. This matches the 0.6.0 behavior +# where ``from .wcwidth import wcwidth`` would have already loaded the +# submodule. Without this, a later ``import wcwidth.wcwidth`` triggers +# on-disk file discovery which rebinds wcwidth.wcwidth from the function to +# the module object. 
+# +# NOTE: this must precede ``from ._wcwidth import wcwidth`` — if wcwidth +# already exists as a package attribute, Python short-circuits and returns +# the existing attribute instead of loading the submodule. +from . import wcwidth as _wcwidth_module # isort:skip + +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value # isort:skip from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap @@ -26,12 +39,6 @@ from .escape_sequences import iter_sequences, strip_sequences from .unicode_versions import list_versions -# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is populated during package -# initialization. This matches the 0.6.0 behavior where 'from .wcwidth import wcwidth' would have -# already loaded the submodule. Without this, a later 'import wcwidth.wcwidth' triggers on-disk -# file discovery which rebinds wcwidth.wcwidth from the function to the module object. -from . import wcwidth as _wcwidth_module # isort:skip - # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". From 1e5f19582b666ac7fa907f698757cebc1d14e87b Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 16:37:10 -0400 Subject: [PATCH 67/70] alright, fixadoodle --- wcwidth/__init__.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index da85400..2bab0b5 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -12,20 +12,6 @@ from ._clip import clip from .align import ljust, rjust, center from ._width import width - -# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is -# populated during package initialization. This matches the 0.6.0 behavior -# where ``from .wcwidth import wcwidth`` would have already loaded the -# submodule. 
Without this, a later ``import wcwidth.wcwidth`` triggers -# on-disk file discovery which rebinds wcwidth.wcwidth from the function to -# the module object. -# -# NOTE: this must precede ``from ._wcwidth import wcwidth`` — if wcwidth -# already exists as a package attribute, Python short-circuits and returns -# the existing attribute instead of loading the submodule. -from . import wcwidth as _wcwidth_module # isort:skip - -from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value # isort:skip from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap @@ -39,6 +25,17 @@ from .escape_sequences import iter_sequences, strip_sequences from .unicode_versions import list_versions +# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is +# populated during package initialization. This matches the 0.6.0 behavior +# where ``from .wcwidth import wcwidth`` would have already loaded the +# submodule. Without this, a later ``import wcwidth.wcwidth`` triggers +# on-disk file discovery which rebinds wcwidth.wcwidth from the function to +# the module object. +# +# NOTE: this sort order is important for legacy import API compatibility before release 0.7.0 +from . import wcwidth as _wcwidth_module # isort:skip +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value # isort:skip + # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". 
From a3155a95ac4e346be1990db5449f7552ea5f8669 Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 17:04:41 -0400 Subject: [PATCH 68/70] skip pedantic tests without codspeed installed --- tests/test_benchmarks.py | 12 ++++++++++++ wcwidth/textwrap.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 8d1375c..e6d3b44 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -514,6 +514,8 @@ def test_width_brahmic_bengali(benchmark): @_py38_skip_pedantic def test_wrap_udhr(benchmark): """Benchmark wrap() with multilingual UDHR text.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(wcwidth.wrap, args=(UDHR_TEXT, 80), rounds=1, iterations=1) assert len(result) assert all(0 <= wcwidth.width(_l) <= 80 for _l in result) @@ -523,6 +525,8 @@ def test_wrap_udhr(benchmark): @_py38_skip_pedantic def test_width_udhr(benchmark): """Benchmark width() with multilingual UDHR text.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(wcwidth.width, args=(UDHR_TEXT,), rounds=1, iterations=1) assert result > 0 @@ -531,6 +535,8 @@ def test_width_udhr(benchmark): @_py38_skip_pedantic def test_width_udhr_lines(benchmark): """Benchmark width() on individual UDHR lines.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(lambda: sum(wcwidth.width(line) for line in UDHR_LINES), rounds=1, iterations=1) assert result > 0 @@ -551,6 +557,8 @@ def check(): if w != wcs: failures.append((line[:60], w, wcs)) return failures + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') failures = benchmark.pedantic(check, rounds=1, iterations=1) assert not failures @@ -568,6 +576,8 @@ def check(): parse_total = sum(wcwidth.width(line) for line in UDHR_LINES) return 
fast_total, parse_total + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') fast_total, parse_total = benchmark.pedantic(check, rounds=1, iterations=1) _width_module._WIDTH_FAST_PATH_MIN_LEN = saved assert fast_total == parse_total @@ -577,6 +587,8 @@ def check(): @_py38_skip_pedantic def test_ljust_udhr_lines(benchmark): """Benchmark ljust() on UDHR lines.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') benchmark.pedantic(lambda: [wcwidth.ljust(line, w + 1, UDHR_FILLCHAR) for line, w in zip(UDHR_LINES, UDHR_WIDTHS)], rounds=1, iterations=1) diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 93eaec8..02cc31d 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -306,8 +306,8 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m if 'id=' in new_state.params: current_hyperlink_id = new_state.params elif new_state.params: - # Prepend id to existing params (per OSC 8 spec, params can have - # multiple key=value pairs separated by :) + # Prepend id to existing params. Per OSC 8 spec, params can have + # multiple key=value pairs separated by ':'. current_hyperlink_id = ( f'id={self._next_hyperlink_id()}:{new_state.params}') else: From a2dbb945c6abe1872b3851766d873ef4a51dea3a Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 18:04:44 -0400 Subject: [PATCH 69/70] bugfix version --- wcwidth/_clip.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py index a22430a..aa26807 100644 --- a/wcwidth/_clip.py +++ b/wcwidth/_clip.py @@ -626,8 +626,6 @@ def clip( .. versionchanged:: 0.7.0 Added ``control_codes`` parameter (default 'parse'). OSC 8 hyperlink-aware clipping. - - .. versionchanged:: 0.8.0 Added ``overtyping`` parameter (default None, auto-detect). 
Example:: From 495f1fa60f7dfa59eb21391f56c210baa865b73a Mon Sep 17 00:00:00 2001 From: Jeff Quast Date: Fri, 1 May 2026 18:30:15 -0400 Subject: [PATCH 70/70] nit --- wcwidth/_wcswidth.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/wcwidth/_wcswidth.py b/wcwidth/_wcswidth.py index 13bae8e..eab9b74 100644 --- a/wcwidth/_wcswidth.py +++ b/wcwidth/_wcswidth.py @@ -51,8 +51,7 @@ def wcswidth( """ # pylint: disable=unused-argument,too-many-locals,too-many-statements # pylint: disable=too-complex,too-many-branches,duplicate-code - # This function intentionally keeps all logic inline for performance — - # local variable state tracking avoids per-character method-call overhead. + # This function intentionally keeps all logic inline for performance. # Fast path: pure ASCII printable strings are always width == length if n is None and pwcs.isascii() and pwcs.isprintable():