diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b51e64ff..f7430c8d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,10 +72,6 @@ jobs: - "3.12" - "3.13" - "3.14" - - "pypy-3.8" - - "pypy-3.9" - - "pypy-3.10" - - "pypy-3.11" runs-on: ${{ matrix.os }} container: ${{ matrix.container }} diff --git a/bin/new-wide-by-version.py b/bin/new-wide-by-version.py deleted file mode 100755 index b0ec5ad8..00000000 --- a/bin/new-wide-by-version.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Display new wide unicode point values, by version. - -For example:: - - "5.0.0": [ - 12752, - 12753, - 12754, - ... - -Means that chr(12752) through chr(12754) are new WIDE values -for Unicode version 5.0.0, and were not WIDE values for the -previous version (4.1.0). -""" -# std imports -import sys -import json - -# local -from wcwidth import WIDE_EASTASIAN, _bisearch - - -def main(): - """List new WIDE characters at each unicode version.""" - versions = list(WIDE_EASTASIAN.keys()) - results = {} - for version in versions: - prev_idx = versions.index(version) - 1 - if prev_idx == -1: - continue - previous_version = versions[prev_idx] - previous_table = WIDE_EASTASIAN[previous_version] - for value_pair in WIDE_EASTASIAN[version]: - for value in range(*value_pair): - if not _bisearch(value, previous_table): - results[version] = results.get(version, []) + [value] - if '--debug' in sys.argv: - print(f'version {version} has unicode character ' - f'0x{value:05x} ({chr(value)}) but previous ' - f'version, {previous_version} does not.', - file=sys.stderr) - print(json.dumps(results, indent=4)) - - -if __name__ == '__main__': - main() diff --git a/bin/update-tables.py b/bin/update-tables.py index 9383703d..2e53a2e4 100644 --- a/bin/update-tables.py +++ b/bin/update-tables.py @@ -1,13 +1,14 @@ #!/usr/bin/env python """ -Update the Unicode code tables for wcwidth. This is code generation using jinja2. 
+Update the Unicode code tables for wcwidth. -This is typically executed through tox, +This is code generation using jinja2. This is typically executed through tox, $ tox -e update https://github.com/jquast/wcwidth """ + from __future__ import annotations # std imports @@ -24,7 +25,7 @@ from pathlib import Path from dataclasses import field, fields, dataclass -from typing import Any, Mapping, Iterable, Iterator, Sequence, Collection +from typing import Any, Mapping, Iterable, Iterator, Optional, Sequence, Collection try: from typing import Self @@ -108,9 +109,10 @@ def _bisearch(ucs, table): @dataclass(order=True, frozen=True) class UnicodeVersion: """A class for comparable unicode version.""" + major: int minor: int - micro: int | None + micro: Optional[int] @classmethod def parse(cls, version_str: str) -> UnicodeVersion: @@ -138,7 +140,8 @@ def __str__(self) -> str: @dataclass(frozen=True) class TableEntry: """An entry of a unicode table.""" - code_range: tuple[int, int] | None + + code_range: Optional[tuple[int, int]] properties: tuple[str, ...] comment: str @@ -255,6 +258,7 @@ class UnicodeTableRenderCtx(RenderContext): @dataclass class RenderDefinition: """Base class, do not instantiate it directly.""" + jinja_filename: str output_filename: str render_context: RenderContext @@ -330,6 +334,7 @@ def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self: @dataclass(frozen=True) class GraphemeTableRenderCtx(RenderContext): """Render context for grapheme tables (latest version only).""" + unicode_version: str tables: Mapping[str, TableDef] @@ -880,7 +885,6 @@ def fetch_table_grapheme_data() -> GraphemeTableRenderCtx: tables.update(parse_indic_syllabic_category( UnicodeDataFile.IndicSyllabicCategory(latest_version) )) - return GraphemeTableRenderCtx(str(latest_version), tables) @@ -895,6 +899,7 @@ class UnicodeDataFile: TestEmojiVariationSequences, these files should be forcefully re-fetched CLI argument '--no- check-last-modified'. 
""" + URL_DERIVED_AGE = 'https://www.unicode.org/Public/UCD/latest/ucd/DerivedAge.txt' URL_EASTASIAN_WIDTH = 'https://www.unicode.org/Public/{version}/ucd/EastAsianWidth.txt' URL_DERIVED_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/extracted/DerivedGeneralCategory.txt' diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py index 8a567cd2..9a10a3bf 100644 --- a/bin/verify-table-integrity.py +++ b/bin/verify-table-integrity.py @@ -65,6 +65,7 @@ +DerivedGeneralCategory-8.0.0.txt:19B0..19C9 ; Lo # [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 """ + # std imports import logging diff --git a/bin/wcwidth-browser.py b/bin/wcwidth-browser.py index 4a439a64..aa033e22 100755 --- a/bin/wcwidth-browser.py +++ b/bin/wcwidth-browser.py @@ -376,6 +376,7 @@ def page_size(self): class Pager: """A less(1)-like browser for browsing unicode characters.""" + # pylint: disable=too-many-instance-attributes #: screen state for next draw method(s). @@ -520,14 +521,13 @@ def run(self, writer, reader): """ Pager entry point. - In interactive mode (terminal is a tty), run until - ``process_keystroke()`` detects quit keystroke ('q'). In - non-interactive mode, exit after displaying all unicode points. + In interactive mode (terminal is a tty), run until ``process_keystroke()`` detects quit + keystroke ('q'). In non-interactive mode, exit after displaying all unicode points. :param writer: callable writes to output stream, receiving unicode. :type writer: callable - :param reader: callable reads keystrokes from input stream, sending - instance of blessed.keyboard.Keystroke. + :param reader: callable reads keystrokes from input stream, sending instance of + blessed.keyboard.Keystroke. :type reader: callable """ self.initialize_page_data() @@ -720,8 +720,8 @@ def draw_heading(self, writer): """ Conditionally redraw screen when ``dirty`` attribute is valued REFRESH. 
- When Pager attribute ``dirty`` is ``STATE_REFRESH``, cursor is moved - to (0,0), screen is cleared, and heading is displayed. + When Pager attribute ``dirty`` is ``STATE_REFRESH``, cursor is moved to (0,0), screen is + cleared, and heading is displayed. :param callable writer: callable writes to output stream, receiving unicode. :return: True if class attribute ``dirty`` is ``STATE_REFRESH``. @@ -787,8 +787,8 @@ def page_view(self, data): """ Generator yields text to be displayed for the current unicode pageview. - :param list[(unicode, unicode)] data: The current page's data as tuple - of ``(ucs, name)``. + :param list[(unicode, unicode)] data: The current page's data as tuple of ``(ucs, + name)``. :returns: generator for full-page text for display """ if self.term.is_a_tty: diff --git a/bin/wcwidth-libc-comparator.py b/bin/wcwidth-libc-comparator.py index 82e6be43..691c2f19 100755 --- a/bin/wcwidth-libc-comparator.py +++ b/bin/wcwidth-libc-comparator.py @@ -13,6 +13,7 @@ This program accepts one optional command-line argument, the unicode version level for our library to use when comparing to libc. """ + # pylint: disable=C0103 # Invalid module name "wcwidth-libc-comparator" diff --git a/code_templates/grapheme_table.py.j2 b/code_templates/grapheme_table.py.j2 index 6596613f..424f0618 100644 --- a/code_templates/grapheme_table.py.j2 +++ b/code_templates/grapheme_table.py.j2 @@ -4,7 +4,7 @@ Exports grapheme cluster break property tables for Unicode version {{ unicode_ve This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. 
""" # pylint: disable=duplicate-code {%- for var_name, table_def in tables.items() %} diff --git a/code_templates/python_table.py.j2 b/code_templates/python_table.py.j2 index ec818c2b..4591025f 100644 --- a/code_templates/python_table.py.j2 +++ b/code_templates/python_table.py.j2 @@ -1,7 +1,7 @@ """ Exports {{ variable_name }} table keyed by supporting unicode version level. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code {{ variable_name }} = { diff --git a/code_templates/unicode_versions.py.j2 b/code_templates/unicode_versions.py.j2 index 7bd53c27..998323bd 100644 --- a/code_templates/unicode_versions.py.j2 +++ b/code_templates/unicode_versions.py.j2 @@ -1,7 +1,7 @@ """ Exports function list_versions() for unicode version level support. -This code generated by {{this_filepath}} on {{utc_now}}. +This code generated by python wcwidth project. """ from __future__ import annotations diff --git a/docs/api.rst b/docs/api.rst index 55d288b4..901b019d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -36,4 +36,8 @@ requirements.txt or equivalent. Their signatures will never change. .. autofunction:: wcwidth.list_versions +.. autofunction:: wcwidth.Hyperlink + +.. autofunction:: wcwidth.HyperlinkParams + .. _SEMVER: https://semver.org diff --git a/docs/intro.rst b/docs/intro.rst index fec7bab6..687409e7 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -1,3 +1,4 @@ + |pypi_downloads| |codecov| |license| ============ @@ -35,33 +36,42 @@ Some examples of **incorrect results**: Solution -------- -The lowest-level functions in this library are the POSIX.1-2001 and POSIX.1-2008 `wcwidth(3)`_ and -`wcswidth(3)`_, which this library precisely copies by interface as `wcwidth()`_ and `wcswidth()`_. -These functions return -1 when C0 and C1 control codes are present. 
+The lowest-level functions in this library are derived from POSIX.1-2001 and POSIX.1-2008 +`wcwidth(3)`_ and `wcswidth(3)`_, which this library precisely copies by interface as `wcwidth()`_ +and `wcswidth()`_. These functions return -1 when C0 and C1 control codes are present. An easy-to-use `width()`_ function is provided as a wrapper of `wcswidth()`_ that is also capable of measuring most terminal control codes and sequences, like colors, bold, tabstops, and horizontal cursor movement. -Text-justification is solved by the grapheme and sequence-aware functions `ljust()`_, -`rjust()`_, `center()`_, and `wrap()`_, serving as drop-in replacements to python standard functions -of the same names. +Text-justification is solved by the sequence-aware functions `ljust()`_, `rjust()`_, `center()`_, +and the grapheme-aware function `wrap()`_, serving as drop-in replacements to python standard +functions. + +The `clip()`_ function extracts substrings by their displayed column positions, and +`strip_sequences()`_ removes terminal escape sequences from text altogether. The iterator functions `iter_graphemes()`_ and `iter_sequences()`_ allow for careful navigation of -grapheme and terminal control sequence boundaries. `iter_graphemes_reverse()`_, and -`grapheme_boundary_before()`_ are useful for editing and searching of complex unicode. The -`clip()`_ function extracts substrings by display column positions, and `strip_sequences()`_ removes -terminal escape sequences from text altogether. +grapheme and terminal control sequence boundaries as required by editors or REPLs with cursor +control. `iter_graphemes_reverse()`_, and `grapheme_boundary_before()`_ are often necessary for +backward cursor control over complex unicode. Discrepancies ------------- -You may find that support *varies* for complex unicode sequences or codepoints. +You may find that support *varies* for complex unicode sequences or codepoints. 
This library may be +considered to presume the terminal is enabled for DEC Private Mode 2027 ("Grapheme Clustering"), but +the specification does not fully describe varying unicode versions, feature levels, or details of +specific language support. This library does *not* support any alternate "legacy width" +measurement. -A companion utility, `jquast/ucs-detect`_ was authored to gather and publish the results of Wide -character, language/grapheme clustering and complex script support, emojis and zero-width joiner, -variations, and regional indicator (flags) as a `General Tabulated Summary`_ by terminal emulator -software and version. +See `Grapheme Clusters and Terminal Emulators`_, `terminal-unicode-core.tex`_, and `State of +Terminal Emulators in 2025`_ for more details on Mode 2027 and unicode-aware terminals. + +The `jquast/ucs-detect`_ utility is used to gather and publish the results of compliance to our +standard for Wide character, Languages, grapheme clustering, complex or combining scripts, emojis, +zero-width joiner, variations, and regional indicator (flags) as a `General +Tabulated Summary`_ by terminal emulator software and version. ======== Overview @@ -118,30 +128,56 @@ Use function `width()`_ to measure a string with improved handling of ``control_ >>> # same support as wcswidth(), eg. 
regional indicator flag: >>> wcwidth.width('\U0001F1FF\U0001F1FC') 2 - >>> # but also supports SGR colored text, 'WARN', followed by SGR reset + >>> # but also supports sequences, like SGR colored text, "WARN", followed by reset >>> wcwidth.width('\x1b[38;2;255;150;100mWARN\x1b[0m') 4 - >>> # tabs, + >>> # tabs are measured as though the string begins at a tabstop, >>> wcwidth.width('\t', tabsize=4) 4 - >>> # or, tab and all other control characters can be ignored - >>> wcwidth.width('\t', control_codes='ignore') - 0 - >>> # "vertical" control characters are ignored - >>> wcwidth.width('\n') + >>> # or, all control characters can be ignored (including tab) + >>> wcwidth.width('\t\n\a\r', control_codes='ignore') 0 - >>> # as well as sequences with "indeterminate" effects like Home + Clear + >>> # sequences with "indeterminate" effects like Home + Clear are zero-width >>> wcwidth.width('\x1b[H\x1b[2J') 0 + >>> # horizontal cursor movements are parsed, + >>> wcwidth.width('hello\b\b\b\b\bworld') + 5 + >>> wcwidth.width('hello\x1b[5Dworld') + 5 + >>> # or ignored, + >>> wcwidth.width('hello\x1b[5Dworld', control_codes='ignore') + 10 + +Use ``control_codes='ignore'`` when the input is known not to contain any control characters or +terminal sequences for slightly improved performance. Note that TAB (``'\t'``) is a control +character and is also ignored; you may want to use `str.expandtabs()`_ first. + +Use ``control_codes='strict'`` when input is known to contain some control sequences, such as +SGR color, bold, hyperlinks and cursor movement. Any sequence that cannot be accurately parsed, +such as clearing the screen, vertical, or absolute cursor movement will raise ``ValueError``: + +.. code-block:: python + >>> # or, raise ValueError for "indeterminate" effects using control_codes='strict' >>> wcwidth.width('\n', control_codes='strict') Traceback (most recent call last): ... 
ValueError: Vertical movement character 0xa at position 0 -Use ``control_codes='ignore'`` when the input is known not to contain any control characters or -terminal sequences for slightly improved performance. Note that TAB (``'\t'``) is a control -character and is also ignored, you may want to use `str.expandtabs()`_, first. + + >>> wcwidth.width('\x1b[H\x1b[2J', control_codes='strict') + Traceback (most recent call last): + ... + ValueError: Indeterminate cursor sequence at position 0, '\x1b[H' + + + >>> # cursor left movement beyond string start raises in strict mode, + >>> wcwidth.width('a\x1b[5Da', control_codes='strict') + Traceback (most recent call last): + ... + ValueError: Cursor left movement at position 1 would move 5 cells left from column 1, exceeding string start + iter_sequences() ---------------- @@ -260,9 +296,25 @@ Use `clip()`_ to extract a substring by column positions, preserving terminal se >>> clip('\x1b[1;31mHello world\x1b[0m', 6, 11) '\x1b[1;31mworld\x1b[0m' - >>> # Disable SGR propagation to preserve original sequences as-is - >>> clip('\x1b[31m中文\x1b[0m', 0, 3, propagate_sgr=False) - '\x1b[31m中 \x1b[0m' + >>> # Disable SGR propagation to preserve sequence order outside of clip boundary + >>> clip('\x1b[31m中文\x1b[32m', 0, 3, propagate_sgr=False) + '\x1b[31m中 \x1b[32m' + + >>> # Cursor-left overwrites previous text (painter's algorithm) + >>> clip('hello\x1b[2DXY', 0, 5) + 'helXY' + >>> # Carriage return resets to column 0, overwriting earlier cells + >>> clip('abc\rXY', 0, 5) + 'XYc' + + >>> # even OSC 8 hyperlink text may be clipped, 'Click This link' -> 'is link' ! + >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 8, 15) + '\x1b]8;;http://example.com\x07is link\x1b]8;;\x07' + +Use ``overtyping=False`` when the input is known not to contain any cursor movement characters +(``\b``, ``\r``, ``CSI C``, ``CSI D``, ``CSI G``) for improved performance. 
When +``overtyping=None`` (default), a slower "Painter's algorithm" may be used after testing for the +presence of these characters. ``overtyping`` has no effect when ``control_codes='ignore'``. strip_sequences() ----------------- @@ -448,6 +500,10 @@ Other Languages There are similar implementations of the `wcwidth()`_ and `wcswidth()`_ functions in other languages. +- `contour-terminal/libunicode`_: C++20 +- `ridiculousfish/widecharwidth`_: Python +- `termux/wcwidth`_: C +- `powerman/wcwidth-icons`_: C - `timoxley/wcwidth`_: JavaScript - `janlelis/unicode-display_width`_: Ruby - `alecrabbit/php-wcwidth`_: PHP @@ -457,6 +513,9 @@ languages. - `grepsuzette/wcwidth`_: Haxe - `aperezdc/lua-wcwidth`_: Lua - `joachimschmidt557/zig-wcwidth`_: Zig +- `mycoboco/wcwidth.js`_: JavaScript +- `ainame/swift-displaywidth`_: Swift +- `pmonks/clj-wcwidth`_: Clojure - `fumiyas/wcwidth-cjk`_: `LD_PRELOAD` override - `joshuarubin/wcwidth9`_: Unicode version 9 in C - `spectreconsole/wcwidth`_: C# @@ -465,6 +524,14 @@ languages. History ======= +0.7.0 *2026-04-30* + * **New** `clip()`_ parameter ``control_codes='parse'``, ``'ignore'``, and ``'strict'``. `clip()`_ + is now able to clip OSC 8 hyperlinks. + * **Improved** `clip()`_ and `width()`_ to support horizontal cursor sequences (``cub``, ``cuf``, + ``hpa``). Cursor-left (``cub``) or backspace (``\b``) now overwrites text. ``column_address`` + (``hpa``) and carriage return (``\r``) are now parsed, and some values conditionally raise + ``ValueError`` when ``control_codes='strict'``. + 0.6.0 *2026-02-06* * **New** Parameters ``expand_tabs``, ``replace_whitespace``, ``fix_sentence_endings``, ``drop_whitespace``, ``max_lines``, and ``placeholder`` for `wrap()`_, completing stdlib @@ -715,6 +782,13 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`fumiyas/wcwidth-cjk`: https://github.com/fumiyas/wcwidth-cjk .. _`joshuarubin/wcwidth9`: https://github.com/joshuarubin/wcwidth9 .. 
_`spectreconsole/wcwidth`: https://github.com/spectreconsole/wcwidth +.. _`contour-terminal/libunicode`: https://github.com/contour-terminal/libunicode +.. _`ridiculousfish/widecharwidth`: https://github.com/ridiculousfish/widecharwidth +.. _`termux/wcwidth`: https://github.com/termux/wcwidth +.. _`powerman/wcwidth-icons`: https://github.com/powerman/wcwidth-icons +.. _`mycoboco/wcwidth.js`: https://github.com/mycoboco/wcwidth.js +.. _`ainame/swift-displaywidth`: https://github.com/ainame/swift-displaywidth +.. _`pmonks/clj-wcwidth`: https://github.com/pmonks/clj-wcwidth .. _`python-cmd2/cmd2`: https://github.com/python-cmd2/cmd2 .. _`stratis-storage/stratis-cli`: https://github.com/stratis-storage/stratis-cli .. _`ihabunek/toot`: https://github.com/ihabunek/toot @@ -748,6 +822,10 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:: .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/ .. _`Terminal.detect_ambiguous_width()`: https://blessed.readthedocs.io/en/latest/api/terminal.html#blessed.terminal.Terminal.detect_ambiguous_width .. _`parity padding`: https://jazcap53.github.io/pythons-eccentric-strcenter.html +.. _`kitty text sizing protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +.. _`Grapheme Clusters and Terminal Emulators`: https://mitchellh.com/writing/grapheme-clusters-in-terminals +.. _`terminal-unicode-core.tex`: https://github.com/contour-terminal/terminal-unicode-core/blob/master/spec/terminal-unicode-core.tex +.. _`State of Terminal Emulators in 2025`: https://www.jeffquast.com/post/state-of-terminal-emulation-2025/ .. 
|pypi_downloads| image:: https://img.shields.io/pypi/dm/wcwidth.svg?logo=pypi :alt: Downloads :target: https://pypi.org/project/wcwidth/ diff --git a/docs/requirements.txt b/docs/requirements.txt index eb4f0317..c2376147 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,35 +1,34 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.14 # by the following command: # # pip-compile --allow-unsafe --no-emit-index-url --output-file=docs/requirements.txt --strip-extras requirements-docs.in # - alabaster==1.0.0 # via sphinx babel==2.18.0 # via sphinx -certifi==2026.1.4 +certifi==2026.4.22 # via requests -charset-normalizer==3.4.4 +charset-normalizer==3.4.7 # via requests docutils==0.22.4 # via # sphinx # sphinx-rtd-theme -idna==3.11 +idna==3.13 # via requests -imagesize==1.4.1 +imagesize==2.0.0 # via sphinx jinja2==3.1.6 # via sphinx markupsafe==3.0.3 # via jinja2 -packaging==26.0 +packaging==26.2 # via sphinx pygments==2.20.0 # via sphinx -requests==2.33.0 +requests==2.33.1 # via sphinx roman-numerals==4.1.0 # via sphinx @@ -41,7 +40,7 @@ sphinx==9.1.0 # sphinx-autodoc-typehints # sphinx-rtd-theme # sphinxcontrib-jquery -sphinx-autodoc-typehints==3.6.2 +sphinx-autodoc-typehints==3.10.2 # via -r requirements-docs.in sphinx-rtd-theme==3.1.0 # via -r requirements-docs.in diff --git a/docs/specs.rst b/docs/specs.rst index ba1d8856..1182b38e 100644 --- a/docs/specs.rst +++ b/docs/specs.rst @@ -6,9 +6,23 @@ Specification This document defines how this Python wcwidth library measures the printable width of characters of a string. This is not meant to an official standard, but as a terse description of the lowest level -API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth`. +API functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth` and its relation to higher level +functions :func:`wcwidth.width` and :func:`wcwidth.iter_graphemes`. 
-The :func:`wcwidth.iter_graphemes` function is mainly specified by `Unicode Standard Annex #29`_. +Scope +----- + +The lowest level functions :func:`wcwidth.wcwidth` and :func:`wcwidth.wcswidth` return -1 when any +control codes are present. The higher level function :func:`wcwidth.width` never returns -1, +accepting default arguments, ``control_codes='parse'`` and its behavior and options are described by +its docstring and specifications of related control codes, `XTerm Control Sequences`_ and `Kitty +Text Sizing Protocol`_. + +Each string yielded by :func:`wcwidth.iter_graphemes` may be mapped to :func:`wcwidth.wcswidth` to +accurately measure the width of a **grapheme**. Although :func:`wcwidth.iter_graphemes` matches +behavior of Python 3.15 `unicodedata.iter_graphemes()`_, it differs in its return value: +:func:`wcwidth.iter_graphemes` yields only strings, while :func:`unicodedata.iter_graphemes` yields +``unicodedata.Segment`` class objects. Width of -1 ----------- @@ -85,15 +99,16 @@ an emoji base, they combine with the base and add 0 to total width. Any characters of `Modifier Symbol`_ category, ``'Sk'`` where ``'FULLWIDTH'`` is present in comment of `UnicodeData.txt`_, aprox. 3 characters. -Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by -`emoji-variation-sequences.txt`_ as ``emoji style``. +Any character with `U+FE0F`_ (Variation Selector 16) defined as ``emoji style`` +in `emoji-variation-sequences.txt`_: VS16 adds 1 cell to the narrow character +it directly follows, making the pair width 2. Wide characters are unchanged. Any character of non-zero width followed by an ``Mc`` (`Spacing Combining Mark`_) character when measured in sequence by :func:`wcwidth.wcswidth` or :func:`wcwidth.width`. The ``Mc`` character adds +1 to the total width, reflecting its *positive advance width* as defined in `General Category`_ (Table 4-4). 
Zero-width combining marks (``Mn``) between the base character -and the ``Mc`` do not break the association — for example, a consonant followed +and the ``Mc`` do not break the association. For example, a consonant followed by a Nukta (``Mn``) and then a vowel sign (``Mc``) is measured as base + 1. Virama Conjunct Formation @@ -101,13 +116,13 @@ Virama Conjunct Formation In `Brahmic scripts`_, a `Virama`_ (``Indic_Syllabic_Category=Virama`` in `IndicSyllabicCategory.txt`_) between two consonants triggers `conjunct`_ -formation: the font engine merges the consonants into a single ligature glyph. +formation: the consonants are merged into a single ligature glyph. - A ``Consonant`` immediately following a ``Virama`` contributes 0 width. -- The conjunct still occupies cells — the next visible advance settles it: +- The conjunct still occupies cells and the next visible advance settles it: - A following ``Mc`` (`Spacing Combining Mark`_, e.g. a vowel sign) counts as - 1 cell and closes the conjunct — no extra cell is added. + 1 cell and closes the conjunct. - A following character with positive width (or end of string) adds 1 cell for the conjunct before counting its own width. @@ -119,6 +134,9 @@ formation: the font engine merges the consonants into a single ligature glyph. See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". +.. _`Hyperlinks in Terminal Emulators`: https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda +.. _`Kitty Text Sizing Protocol`: https://sw.kovidgoyal.net/kitty/text-sizing-protocol/ +.. _`XTerm Control Sequences`: https://invisible-island.net/xterm/ctlseqs/ctlseqs.html .. _`U+0000`: https://codepoints.net/U+0000 .. _`U+0001`: https://codepoints.net/U+0001 .. _`U+001F`: https://codepoints.net/U+001F @@ -164,3 +182,4 @@ See also: `L2/2023/23107`_ "Proper Complex Script Support in Text Terminals". .. _`aksara`: https://www.unicode.org/glossary/#aksara .. 
_`L2/2023/23107`: https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/ +.. _`unicodedata.iter_graphemes()`: https://docs.python.org/3.15/library/unicodedata.html#unicodedata.iter_graphemes diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst index 41a1e520..38ff78df 100644 --- a/docs/unicode_version.rst +++ b/docs/unicode_version.rst @@ -16,6 +16,21 @@ release files: ``emoji-variation-sequences-12.0.0.txt`` *Date: 2019-01-15, 12:10:05 GMT* +``emoji-variation-sequences-13.0.0.txt`` + *Date: 2020-01-21, 07:15:05 GMT* + +``emoji-variation-sequences-14.0.0.txt`` + *Date: 2021-06-08, 05:19:16 GMT* + +``emoji-variation-sequences-15.0.0.txt`` + *Date: 2022-05-13, 21:54:24 GMT* + +``emoji-variation-sequences-15.1.0.txt`` + *Date: 2023-02-01, 02:22:54 GMT* + +``emoji-variation-sequences-16.0.0.txt`` + *Date: 2024-05-01, 21:25:24 GMT* + ``emoji-variation-sequences-17.0.0.txt`` *Date: 2025-01-30, 21:48:29 GMT* diff --git a/pyproject.toml b/pyproject.toml index 0fed636b..5b049046 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "hatchling" ] [project] name = "wcwidth" -version = "0.6.0" +version = "0.7.0" # don't forget to also update wcwidth/__init__.py:__version__ description = "Measures the displayed width of unicode strings in a terminal" readme = "README.rst" keywords = [ diff --git a/requirements-tests38.in b/requirements-tests38.in index 19efdeb9..ea2794e6 100644 --- a/requirements-tests38.in +++ b/requirements-tests38.in @@ -1,6 +1,5 @@ # for python3.8 pytest<7 pytest-cov -pytest-xdist coverage[toml]<6 packaging<26 diff --git a/requirements-tests38.txt b/requirements-tests38.txt index 0b9d25e1..2b8a2692 100644 --- a/requirements-tests38.txt +++ b/requirements-tests38.txt @@ -10,8 +10,6 @@ coverage==5.5 # via # -r requirements-tests38.in # pytest-cov -execnet==2.1.2 - # via pytest-xdist iniconfig==2.1.0 # via pytest packaging==25.0 @@ -26,11 +24,8 @@ 
pytest==6.2.5 # via # -r requirements-tests38.in # pytest-cov - # pytest-xdist pytest-cov==5.0.0 # via -r requirements-tests38.in -pytest-xdist==3.5.0 - # via -r requirements-tests38.in toml==0.10.2 # via # coverage diff --git a/requirements-tests39.in b/requirements-tests39.in index 8c7d45f5..2a141825 100644 --- a/requirements-tests39.in +++ b/requirements-tests39.in @@ -1,9 +1,10 @@ # For Python 3.9 *and newer* pytest>=7.4.2 pytest-cov>=4.1.0 -pytest-xdist pytest-codspeed importlib-metadata<8.7.1 packaging<26.0 tomli<2.3.0 cffi<2 +pygments<2.20 +zipp<3.23.1 diff --git a/requirements-tests39.txt b/requirements-tests39.txt index 65682e50..0fe17a51 100644 --- a/requirements-tests39.txt +++ b/requirements-tests39.txt @@ -4,7 +4,6 @@ # # pip-compile --allow-unsafe --no-emit-index-url --output-file=requirements-tests39.txt --strip-extras requirements-tests39.in # - cffi==1.17.1 # via # -r requirements-tests39.in @@ -13,8 +12,6 @@ coverage==7.10.7 # via pytest-cov exceptiongroup==1.3.1 # via pytest -execnet==2.1.2 - # via pytest-xdist importlib-metadata==8.7.0 # via # -r requirements-tests39.in @@ -37,6 +34,7 @@ pycparser==2.23 # via cffi pygments==2.19.2 # via + # -r requirements-tests39.in # pytest # rich pytest==8.4.2 @@ -44,14 +42,11 @@ pytest==8.4.2 # -r requirements-tests39.in # pytest-codspeed # pytest-cov - # pytest-xdist -pytest-codspeed==4.2.0 - # via -r requirements-tests39.in -pytest-cov==7.0.0 +pytest-codspeed==4.5.0 # via -r requirements-tests39.in -pytest-xdist==3.8.0 +pytest-cov==7.1.0 # via -r requirements-tests39.in -rich==14.3.1 +rich==15.0.0 # via pytest-codspeed tomli==2.2.1 # via @@ -61,4 +56,6 @@ tomli==2.2.1 typing-extensions==4.15.0 # via exceptiongroup zipp==3.23.0 - # via importlib-metadata + # via + # -r requirements-tests39.in + # importlib-metadata diff --git a/requirements-update.txt b/requirements-update.txt index 92830ab4..209b3f49 100644 --- a/requirements-update.txt +++ b/requirements-update.txt @@ -1,14 +1,14 @@ # -# This file is 
autogenerated by pip-compile with Python 3.13 +# This file is autogenerated by pip-compile with Python 3.14 # by the following command: # # pip-compile --allow-unsafe --no-emit-index-url --output-file=requirements-update.txt --strip-extras requirements-update.in # -certifi==2026.2.25 +certifi==2026.4.22 # via requests charset-normalizer==3.4.7 # via requests -idna==3.11 +idna==3.13 # via requests jinja2==3.1.6 # via -r requirements-update.in diff --git a/tests/conftest.py b/tests/conftest.py index 2d0a2779..ecbbdc87 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ """Pytest configuration and fixtures.""" + # 3rd party import pytest @@ -10,6 +11,7 @@ @pytest.fixture def benchmark(): """No-op benchmark fixture for environments without pytest-codspeed.""" + def _passthrough(func, *args, **kwargs): return func(*args, **kwargs) return _passthrough diff --git a/tests/test_ambiguous.py b/tests/test_ambiguous.py index 0c61cdac..20ed4d7d 100644 --- a/tests/test_ambiguous.py +++ b/tests/test_ambiguous.py @@ -1,4 +1,5 @@ """Tests for ambiguous_width parameter.""" + # 3rd party import pytest diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index b85448eb..e6d3b44a 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -1,4 +1,5 @@ """Performance benchmarks for wcwidth module.""" + # std imports import os import sys @@ -10,7 +11,7 @@ # local import wcwidth -_wcwidth_module = sys.modules['wcwidth.wcwidth'] +_width_module = sys.modules['wcwidth._width'] def test_wcwidth_ascii(benchmark): @@ -292,6 +293,137 @@ def test_clip_complex_sgr(benchmark): benchmark(wcwidth.clip, text, 6, 11) +def test_clip_long_cjk_past_window(benchmark): + """Benchmark clip() with long CJK text, narrow window (early-exit path).""" + text = '中文测试字符串' * 100 # 700 chars, no escape sequences + benchmark(wcwidth.clip, text, 0, 50) + + +def test_clip_dense_ansi_past_window(benchmark): + """Benchmark clip() with dense ANSI sequences past clip window (SGR 
tracking).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30) + + +def test_clip_dense_ansi_no_propagate(benchmark): + """Benchmark clip() with dense ANSI sequences, SGR propagation disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False) + + +def test_clip_osc8_hyperlinks(benchmark): + """Benchmark clip() with OSC 8 hyperlinks (hyperlink parsing path).""" + text = '\x1b]8;;http://example.com\x07Click Here\x1b]8;;\x07 ' * 20 + benchmark(wcwidth.clip, text, 0, 80) + + +def test_clip_cursor_cr_overwrite(benchmark): + """Benchmark clip() with carriage-return overwrite (painter path).""" + text = 'hello\rworld ' * 20 + benchmark(wcwidth.clip, text, 0, 50) + + +def test_clip_cursor_csi_backward(benchmark): + """Benchmark clip() with CSI cursor-backward sequences (painter path).""" + text = 'hello\x1b[2Dxy ' * 20 + benchmark(wcwidth.clip, text, 0, 40) + + +def test_clip_long_ascii_fastpath(benchmark): + """Benchmark clip() with long ASCII string (fast-path slice).""" + text = 'hello world ' * 1000 + benchmark(wcwidth.clip, text, 500, 600) + + +def test_clip_with_ansi_no_overtype(benchmark): + """Benchmark clip() with ANSI sequences, overtyping disabled.""" + text = '\x1b[31m中文字\x1b[0m' + benchmark(wcwidth.clip, text, 0, 3, overtyping=False) + + +def test_clip_complex_sgr_no_overtype(benchmark): + """Benchmark clip() with complex SGR, overtyping disabled.""" + text = '\x1b[1;38;5;208mHello world text\x1b[0m' + benchmark(wcwidth.clip, text, 6, 11, overtyping=False) + + +def test_clip_dense_ansi_no_overtype(benchmark): + """Benchmark clip() with dense ANSI, overtyping disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, overtyping=False) + + +def test_clip_dense_ansi_no_propagate_no_overtype(benchmark): + """Benchmark clip() with dense 
ANSI, SGR propagation and overtyping disabled.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False, overtyping=False) + + +def test_clip_dense_ansi_overtype(benchmark): + """Benchmark clip() with dense ANSI, overtyping forced (painter path).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, overtyping=True) + + +def test_clip_long_cjk_overtype(benchmark): + """Benchmark clip() with long CJK, overtyping forced (painter path).""" + text = '中文测试字符串' * 100 + benchmark(wcwidth.clip, text, 0, 50, overtyping=True) + + +def test_width_dense_ansi_control_codes_ignore(benchmark): + """Benchmark width() with dense ANSI and control_codes='ignore'.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.width, text, control_codes='ignore') + + +def test_width_complex_ansi_control_codes_ignore(benchmark): + """Benchmark width() with complex ANSI and control_codes='ignore'.""" + text = '\x1b[38;2;255;150;100mWARN\x1b[0m: \x1b[1mBold\x1b[0m \x1b[4mUnderline\x1b[0m' + benchmark(wcwidth.width, text, control_codes='ignore') + + +def test_clip_dense_ansi_control_codes_ignore(benchmark): + """Benchmark clip() with dense ANSI, control_codes='ignore' (skips painter/OSC).""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, control_codes='ignore') + + +def test_clip_long_cjk_control_codes_ignore(benchmark): + """Benchmark clip() with long CJK and control_codes='ignore' (early-exit path).""" + text = '中文测试字符串' * 100 + benchmark(wcwidth.clip, text, 0, 50, control_codes='ignore') + + +def test_clip_cursor_cr_control_codes_ignore(benchmark): + """Benchmark clip() with CR overwrite and control_codes='ignore' (painter skipped).""" + text = 'hello\rworld ' * 20 + benchmark(wcwidth.clip, text, 0, 50, control_codes='ignore') + + 
+def test_clip_dense_ansi_no_propagate_control_codes_ignore(benchmark): + """Benchmark clip() with dense ANSI, propagate_sgr=False and control_codes='ignore'.""" + text = '\x1b[31mred\x1b[0m \x1b[32mgreen\x1b[0m \x1b[33myellow\x1b[0m ' * 50 + benchmark(wcwidth.clip, text, 6, 30, propagate_sgr=False, control_codes='ignore') + + +def test_clip_long_ascii_control_codes_ignore(benchmark): + """Benchmark clip() with long ASCII and control_codes='ignore' (fast-path slice).""" + text = 'hello world ' * 1000 + benchmark(wcwidth.clip, text, 500, 600, control_codes='ignore') + + +def test_wrap_with_ansi_control_codes_ignore(benchmark): + """Benchmark wrap() with ANSI sequences and control_codes='ignore'.""" + text = '\x1b[31mThe quick brown fox jumps over the lazy dog.\x1b[0m Did it really? ' * 20 + benchmark(wcwidth.wrap, text, 40, control_codes='ignore') + + +def test_ljust_ascii_control_codes_ignore(benchmark): + """Benchmark ljust() with ASCII and control_codes='ignore'.""" + benchmark(wcwidth.ljust, 'hello', 20, control_codes='ignore') + + def test_propagate_sgr_multiline(benchmark): """Benchmark propagate_sgr() with multiple lines.""" lines = ['\x1b[1;31mline one', 'line two', 'line three\x1b[0m'] @@ -327,7 +459,7 @@ def test_iter_sequences_mixed(benchmark): benchmark(lambda: list(wcwidth.iter_sequences(text))) -# Brahmic script benchmarks — text with virama conjuncts +# Brahmic script benchmarks -- text with virama conjuncts BRAHMIC_DEVANAGARI = 'हिन्दी भाषा में लिखा गया पाठ है। क्षत्रिय स्त्री ' * 20 BRAHMIC_BENGALI = 'বাংলা ভাষায় লেখা একটি পাঠ। বাঙ্গালী ভাষা ' * 20 @@ -373,33 +505,48 @@ def test_width_brahmic_bengali(benchmark): reason=f"{os.path.basename(UDHR_FILE)} is missing; run bin/update-tables.py", ) +_py38_skip_pedantic = pytest.mark.skipif( + sys.version_info[:2] < (3, 9), + reason='benchmark.pedantic() not supported in python 3.8 or earlier') + @_udhr_skip +@_py38_skip_pedantic def test_wrap_udhr(benchmark): """Benchmark wrap() with multilingual UDHR 
text.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(wcwidth.wrap, args=(UDHR_TEXT, 80), rounds=1, iterations=1) assert len(result) assert all(0 <= wcwidth.width(_l) <= 80 for _l in result) @_udhr_skip +@_py38_skip_pedantic def test_width_udhr(benchmark): """Benchmark width() with multilingual UDHR text.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(wcwidth.width, args=(UDHR_TEXT,), rounds=1, iterations=1) assert result > 0 @_udhr_skip +@_py38_skip_pedantic def test_width_udhr_lines(benchmark): """Benchmark width() on individual UDHR lines.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') result = benchmark.pedantic(lambda: sum(wcwidth.width(line) for line in UDHR_LINES), rounds=1, iterations=1) assert result > 0 @_udhr_skip +@_py38_skip_pedantic def test_width_wcswidth_consistency_udhr(benchmark): """Verify width() and wcswidth() agree for printable multilingual text.""" + def check(): failures = [] for line in UDHR_LINES: @@ -410,30 +557,38 @@ def check(): if w != wcs: failures.append((line[:60], w, wcs)) return failures + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') failures = benchmark.pedantic(check, rounds=1, iterations=1) assert not failures @_udhr_skip +@_py38_skip_pedantic def test_width_fastpath_integrity_udhr(benchmark): """Verify width() produces identical results with and without the fast path.""" - saved = _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN + saved = _width_module._WIDTH_FAST_PATH_MIN_LEN def check(): - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = 0 + _width_module._WIDTH_FAST_PATH_MIN_LEN = 0 fast_total = sum(wcwidth.width(line) for line in UDHR_LINES) - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = 999_999 + _width_module._WIDTH_FAST_PATH_MIN_LEN = 999_999 parse_total = sum(wcwidth.width(line) for line in UDHR_LINES) return 
fast_total, parse_total + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') fast_total, parse_total = benchmark.pedantic(check, rounds=1, iterations=1) - _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN = saved + _width_module._WIDTH_FAST_PATH_MIN_LEN = saved assert fast_total == parse_total @_udhr_skip +@_py38_skip_pedantic def test_ljust_udhr_lines(benchmark): """Benchmark ljust() on UDHR lines.""" + if not hasattr(benchmark, 'pedantic'): + pytest.skip('pytest-codspeed not installed') benchmark.pedantic(lambda: [wcwidth.ljust(line, w + 1, UDHR_FILLCHAR) for line, w in zip(UDHR_LINES, UDHR_WIDTHS)], rounds=1, iterations=1) diff --git a/tests/test_clip.py b/tests/test_clip.py index 5be64307..8ab3f1d2 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -1,4 +1,5 @@ """Tests for clip() and strip_sequences() functions.""" + # 3rd party import pytest @@ -114,26 +115,162 @@ def test_clip_sequences_after_end(): # With propagate_sgr=True (default), no style active at start, so no prefix assert clip('hello\x1b[31m world\x1b[0m', 0, 5) == 'hello' # With propagate_sgr=False, all sequences preserved - assert clip('hello\x1b[31m world\x1b[0m', 0, 5, propagate_sgr=False) == 'hello\x1b[31m\x1b[0m' + assert repr(clip('hello\x1b[31m world\x1b[0m', 0, 5, propagate_sgr=False)) == repr('hello\x1b[31m\x1b[0m') def test_clip_sequences_multiple(): # With propagate_sgr=True (default), sequences collapsed to minimal assert clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4) == '\x1b[1;31mbold\x1b[0m' # With propagate_sgr=False, all sequences preserved separately - assert clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4, propagate_sgr=False) == '\x1b[1m\x1b[31mbold\x1b[0m' + assert repr(clip('\x1b[1m\x1b[31mbold red\x1b[0m', 0, 4, propagate_sgr=False)) == repr('\x1b[1m\x1b[31mbold\x1b[0m') def test_clip_sequences_only(): # With propagate_sgr=True (default), no visible text means empty result assert clip('\x1b[31m\x1b[0m', 0, 10) == '' # With propagate_sgr=False, 
sequences preserved - assert clip('\x1b[31m\x1b[0m', 0, 10, propagate_sgr=False) == '\x1b[31m\x1b[0m' + assert repr(clip('\x1b[31m\x1b[0m', 0, 10, propagate_sgr=False)) == repr('\x1b[31m\x1b[0m') def test_clip_sequences_osc_hyperlink(): - assert clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4) == \ + assert repr(clip('\x1b]8;;https://example.com\x07link\x1b]8;;\x07', 0, 4)) == repr( '\x1b]8;;https://example.com\x07link\x1b]8;;\x07' + ) + + +# OSC 8 hyperlink clipping + +OSC_START_BEL = '\x1b]8;;http://example.com\x07' +OSC_END_BEL = '\x1b]8;;\x07' +OSC_START_ST = '\x1b]8;;http://example.com\x1b\\' +OSC_END_ST = '\x1b]8;;\x1b\\' + + +CLIP_HYPERLINK_CASES = [ + # Full hyperlink visible -- preserved as-is + (f'{OSC_START_BEL}link{OSC_END_BEL}', 0, 4, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # Clipping middle of hyperlink text -- rebuild around clipped inner text + (f'{OSC_START_BEL}Click This link{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}This{OSC_END_BEL}'), + # Clipping from start -- only first portion + (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 0, 5, + f'{OSC_START_BEL}Click{OSC_END_BEL}'), + # Clipping from end -- only last portion + (f'{OSC_START_BEL}Click This{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}This{OSC_END_BEL}'), + # Hyperlink fills clip window, plain text after it -- trailing text dropped + (f'{OSC_START_BEL}link{OSC_END_BEL}world', 0, 4, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # Hyperlink entirely after clip window -- dropped + (f'hello{OSC_START_BEL}link{OSC_END_BEL}', 0, 5, 'hello'), + # Hyperlink clipped to nothing -- empty hyperlink dropped + (f'{OSC_START_BEL}link{OSC_END_BEL}', 5, 10, ''), + # Empty hyperlink (no inner text) -- dropped + (f'before{OSC_START_BEL}{OSC_END_BEL}after', 0, 11, 'beforeafter'), + # Hyperlink with CJK text clipped + (f'{OSC_START_BEL}中文文字{OSC_END_BEL}', 0, 4, + f'{OSC_START_BEL}中文{OSC_END_BEL}'), + # Hyperlink with CJK text clipped at odd column + (f'{OSC_START_BEL}中文文字{OSC_END_BEL}', 0, 3, + f'{OSC_START_BEL}中 {OSC_END_BEL}'), 
+ # Hyperlink with ST terminator + (f'{OSC_START_ST}Click This{OSC_END_ST}', 0, 5, + f'{OSC_START_ST}Click{OSC_END_ST}'), + # Multiple non-overlapping hyperlinks + (f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}', 0, 5, + f'{OSC_START_BEL}ab{OSC_END_BEL} {OSC_START_ST}cd{OSC_END_ST}'), + # Hyperlink with params preserved + ('\x1b]8;id=myid;http://example.com\x07link\x1b]8;;\x07', 1, 3, + '\x1b]8;id=myid;http://example.com\x07in\x1b]8;;\x07'), + # Hyperlink text before clip window, hyperlink within + (f'before{OSC_START_BEL}link{OSC_END_BEL}', 6, 10, + f'{OSC_START_BEL}link{OSC_END_BEL}'), + # SGR inside hyperlink is preserved + (f'{OSC_START_BEL}\x1b[31mred link\x1b[0m{OSC_END_BEL}', 4, 8, + f'{OSC_START_BEL}\x1b[31mlink\x1b[0m{OSC_END_BEL}'), + # Hyperlink open without matching close -- preserved as regular sequence + ('\x1b]8;;http://example.com\x07link', 0, 4, '\x1b]8;;http://example.com\x07link'), + # Bare ESC between hyperlink markers + ('\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07', 0, 6, + '\x1b]8;;url\x07ab\x1bxcd\x1b]8;;\x07'), + # Per OSC 8 spec "A note on opening/closing hyperlinks": terminal + # emulators treat hyperlinks as a state attribute, not nested anchors. + # Opening a new hyperlink replaces the current one; a single close + # terminates the hyperlink regardless of how many opens preceded it. + # + # Two opens, one close: URL "b" replaces "a", close terminates. + ('\x1b]8;;a\x07AB\x1b]8;;b\x07CD\x1b]8;;\x07EF', 0, 6, + '\x1b]8;;a\x07AB\x1b]8;;b\x07CD\x1b]8;;\x07EF'), + # URL switch without closing: "b" replaces "a", no close in input. + ('\x1b]8;;a\x07AB\x1b]8;;b\x07CD', 0, 4, + '\x1b]8;;a\x07AB\x1b]8;;b\x07CD'), + # Multiple opens, close, bare close: "b" replaces "a", first close + # terminates, trailing close is harmless (closing when not open). 
+ ('\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07', 0, 10, + '\x1b]8;;a\x07ABCD \x1b]8;;b\x07XY\x1b]8;;\x07 EF\x1b]8;;\x07'), +] + + +@pytest.mark.parametrize('text,start,end,expected', CLIP_HYPERLINK_CASES) +def test_clip_osc_hyperlink_text_clipping(text, start, end, expected): + """OSC 8 hyperlink inner text is clipped and hyperlink rebuilt.""" + assert repr(clip(text, start, end)) == repr(expected) + + +# Control_codes variants with cursor movement into hyperlink +# +# Overwriting hyperlink cells causes corrupted "run on" hyperlinks in practical +# testing with kitty, presumably the hidden "end hyperlink" sequence is +# overwritten, in any case, we make no attempt to parse overwrite of +# hyperlinks, we consider it a "glitch sequence" +_HLINK_OVERWRITE = f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[2Dxy' +CLIP_HYPERLINK_CONTROL_CODES_CASES = [ + ('parse', 0, 4, f'{OSC_START_BEL}link{OSC_END_BEL}'), + ('parse', 0, 3, f'{OSC_START_BEL}lin{OSC_END_BEL}'), + ('parse', 0, 2, f'{OSC_START_BEL}li{OSC_END_BEL}'), + ('parse', 0, 1, f'{OSC_START_BEL}l{OSC_END_BEL}'), + # these next two are certainly "in error" + ('parse', 1, 4, f'{OSC_START_BEL}ink{OSC_END_BEL}y'), + ('parse', 1, 3, f'{OSC_START_BEL}in{OSC_END_BEL}x'), + ('parse', 1, 2, f'{OSC_START_BEL}i{OSC_END_BEL}'), + ('ignore', 0, 20, f'{_HLINK_OVERWRITE}'), + # and these two, 'xy' are missing entirely, also "in error" + ('parse', 0, 20, f'{OSC_START_BEL}link{OSC_END_BEL}'), + ('strict', 0, 20, f'{OSC_START_BEL}link{OSC_END_BEL}'), +] + + +@pytest.mark.parametrize('control_codes,start,end,expected', + CLIP_HYPERLINK_CONTROL_CODES_CASES) +def test_clip_hyperlink_control_codes_overwrite(control_codes, start, end, expected): + assert repr(clip(_HLINK_OVERWRITE, start, end, control_codes=control_codes)) == repr(expected) + + +# Painter-path hyperlink edge cases +CLIP_HYPERLINK_PAINTER_CASES = [ + # Empty hyperlink dropped + (f'\x1b[2D{OSC_START_BEL}{OSC_END_BEL}xy', 'parse', 0, 4, 'xy'), + # Hyperlink entirely 
after clip window -- skipped + (f'\x1b[2Dab{OSC_START_BEL}cde{OSC_END_BEL}', 'parse', 0, 2, 'ab'), + # Hyperlink entirely before clip window -- skipped + (f'{OSC_START_BEL}ab{OSC_END_BEL}\x1b[2Dcdef', 'parse', 2, 4, 'ef'), + # Hyperlink overlapping clip window -- clipped + (f'\x1b[2D{OSC_START_BEL}abcdef{OSC_END_BEL}', 'parse', 0, 3, + f'{OSC_START_BEL}abc{OSC_END_BEL}'), + # Bare ESC inside hyperlink in painter path + (f'\x1b[2D{OSC_START_BEL}a\x1bb{OSC_END_BEL}', 'parse', 0, 4, + f'{OSC_START_BEL}a\x1bb{OSC_END_BEL}'), + # strict mode: non-hyperlink cells don't overlap hyperlink_cells + (f'{OSC_START_BEL}link{OSC_END_BEL}\x1b[5Chi', 'strict', 0, 11, + f'{OSC_START_BEL}link{OSC_END_BEL} hi'), +] + + +@pytest.mark.parametrize('text,control_codes,start,end,expected', + CLIP_HYPERLINK_PAINTER_CASES) +def test_clip_hyperlink_painter_cases(text, control_codes, start, end, expected): + assert repr(clip(text, start, end, control_codes=control_codes)) == repr(expected) def test_clip_sequences_cjk_with_sequences(): @@ -230,11 +367,11 @@ def test_clip_tab_with_sequences(): CLIP_CONTROL_CHAR_CASES = [ - ('abc\bde', 0, 5, 'abc\bde'), - ('ab\acd', 0, 4, 'ab\acd'), + ('abc\bde', 0, 5, 'abde'), + ('ab\acd', 0, 4, 'ab\x07cd'), ('ab\x00cd', 0, 4, 'ab\x00cd'), - ('abc\rde', 0, 5, 'abc\rde'), - ('\a\b\rHello', 0, 5, '\a\b\rHello'), + ('abc\rde', 0, 5, 'dec'), + ('\a\b\rHello', 0, 5, '\x07Hello'), ('ab\x01\x02cd', 0, 4, 'ab\x01\x02cd'), ('ab\x1b\x00cd', 0, 4, 'ab\x1b\x00cd'), ] @@ -245,19 +382,73 @@ def test_clip_control_chars_zero_width(text, start, end, expected): assert clip(text, start, end) == expected -CLIP_CURSOR_SEQUENCE_CASES = [ - ('ab\x1b[5Ccd', 0, 4, 'ab\x1b[5Ccd'), - ('abcde\x1b[2Df', 0, 6, 'abcde\x1b[2Df'), - ('ab\x1b[10Ccd', 0, 4, 'ab\x1b[10Ccd'), - ('ab\x1b[Ccd', 0, 4, 'ab\x1b[Ccd'), +def test_clip_tab_first_visible_with_sgr(): + """Tab as first visible character with SGR propagation.""" + assert clip('\x1b[31m\tb', 0, 4, tabsize=8) == '\x1b[31m \x1b[0m' + + +def 
test_clip_overtyping_override_by_control_codes_ignore(): + """When overtyping=True and control_codes='ignore', overtyping is overridden to False.""" + # elif entered: overtyping=True + control_codes='ignore' → overtyping=False + assert clip('hello world', 0, 5, overtyping=True, control_codes='ignore') == 'hello' + # Verify that overtyping is actually disabled: cursor movement chars are + # treated as zero-width, so the result is the same as without overtyping. + assert clip('ab\x08cd', 0, 4, overtyping=True, control_codes='ignore') == 'ab\x08cd' + + +def test_clip_overtyping_without_ignore(): + """When overtyping=True and control_codes='parse', elif is not entered.""" + # elif skipped: overtyping=True + control_codes='parse' → overtyping stays True + # The painter path is used, cursor movement sequences affect output. + assert clip('ab\x1b[2Dcd', 0, 4, overtyping=True, control_codes='parse') == 'cd' + + +# Indeterminate-effect sequences that raise ValueError in strict mode +# (matching width() behavior). These are not cursor-movement sequences, +# so they exercise the simple (non-overtyping) path. 
+ +INDETERMINATE_SEQUENCES = [ + ('\x1b[K', 'erase_in_line'), + ('\x1b[2K', 'erase_in_line_params'), + ('\x1b[J', 'erase_in_display'), + ('\x1b[2J', 'erase_in_display_params'), + ('\x1b[H', 'cursor_home'), + ('\x1b[1;1H', 'cursor_address'), + ('\x1b[A', 'cursor_up'), + ('\x1b[2A', 'cursor_up_params'), + ('\x1b[B', 'cursor_down'), + ('\x1b[5B', 'cursor_down_params'), + ('\x1b[P', 'delete_character'), + ('\x1b[1P', 'parm_dch'), + ('\x1b[M', 'delete_line'), + ('\x1b[1M', 'parm_delete_line'), + ('\x1b[L', 'insert_line'), + ('\x1b[1L', 'parm_insert_line'), + ('\x1b[@', 'insert_character'), + ('\x1b[1X', 'erase_chars'), + ('\x1b[S', 'scroll_up'), + ('\x1b[T', 'scroll_down'), + ('\x1b[?1049h', 'enter_fullscreen'), + ('\x1b[?1049l', 'exit_fullscreen'), + ('\x1bD', 'scroll_forward'), + ('\x1bM', 'scroll_reverse'), + ('\x1b8', 'restore_cursor'), + ('\x1bc', 'full_reset'), ] -@pytest.mark.parametrize('text,start,end,expected', CLIP_CURSOR_SEQUENCE_CASES) -def test_clip_cursor_sequences_zero_width(text, start, end, expected): - assert clip(text, start, end) == expected +@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) +def test_clip_strict_indeterminate_raises(seq, cap_name): + """Clip() strict mode raises ValueError on indeterminate-effect sequences.""" + with pytest.raises(ValueError, match='Indeterminate cursor sequence'): + clip(f'hello{seq}world', 0, 10, control_codes='strict') -def test_clip_tab_first_visible_with_sgr(): - """Tab as first visible character with SGR propagation.""" - assert clip('\x1b[31m\tb', 0, 4, tabsize=8) == '\x1b[31m \x1b[0m' +@pytest.mark.parametrize('seq,cap_name', INDETERMINATE_SEQUENCES) +def test_clip_parse_indeterminate_preserved(seq, cap_name): + """Clip() parse mode preserves indeterminate sequences as zero-width.""" + result = clip(f'hello{seq}world', 0, 10, control_codes='parse') + # The sequence is preserved, visible text is hello + world = 10 chars + assert 'hello' in result + assert 'world' in result + assert seq in 
result diff --git a/tests/test_clip_cjk_emoji.py b/tests/test_clip_cjk_emoji.py new file mode 100644 index 00000000..e41bd627 --- /dev/null +++ b/tests/test_clip_cjk_emoji.py @@ -0,0 +1,47 @@ +""" +Tests for clip() with CJK and Emoji characters. + +These ensure wide graphemes (CJK / emoji / ZWJ sequences) are clipped correctly: +- Partial columns of a wide grapheme are replaced by fillchar. +- Full grapheme included when fully inside slice. +""" + +# 3rd party +import pytest + +# local +from wcwidth import clip, width + + +@pytest.mark.parametrize("ch", [ + "中", + "🙂", + "👨\u200d👩\u200d👧", # family ZWJ + "👩\u200d👩\u200d👧" # another ZWJ variant +]) +def test_partial_and_full_wide_grapheme(ch): + w = width(ch) + assert w >= 1 + if w > 1: + # partial clip of first column -> fillchar + assert clip(ch, 0, 1) == ' ' + # full clip covering entire grapheme -> original grapheme + assert clip(ch, 0, w) == ch + # width of clipped full grapheme should match + assert width(clip(ch, 0, w)) == w + else: + # narrow grapheme: trivial + assert clip(ch, 0, 1) == ch + + +def test_mixed_cjk_emoji_sequence(): + text = 'A中🙂B' + total_w = width(text) + # sanity + assert total_w >= 4 + # pick a slice that includes the middle two columns (center of string) + # ensure clip doesn't raise and width matches requested slice + start = 1 + end = 4 + out = clip(text, start, end) + assert width(out) == (end - start) diff --git a/tests/test_clip_overtyping.py b/tests/test_clip_overtyping.py new file mode 100644 index 00000000..1d106bae --- /dev/null +++ b/tests/test_clip_overtyping.py @@ -0,0 +1,159 @@ +""" +Tests for clip()'s overtyping (painter) path. + +The painter algorithm is used when the text contains cursor movement sequences +(CSI n C/D, backspace, carriage return, HPA) that require column-level tracking +to determine the final visible output. 
Auto-detection of the overtyping path +happens in clip() via the presence of \\x08, \\r, or horizontal cursor movement +escape sequences, or can be forced with ``overtyping=True``. + +These tests codify expected visible results when cursor movement sequences +affect horizontal positions. +""" + +# 3rd party +import pytest + +# local +from wcwidth import clip + + +@pytest.mark.parametrize("text,start,end,kwargs,expected", [ + # Cursor-right introduces a gap that should be filled with spaces + ("hello\x1b[10Cworld", 0, 10, {}, "hello" + " " * 5), + # Clipping just the initial region ignores the later rightward write + ("hello\x1b[10Cworld", 0, 5, {}, "hello"), + # Cursor-left overwrites previous characters + ("hello\x1b[2DXY", 0, 5, {}, "helXY"), + # Cursor-left overwrites entire visible token + ("abc\x1b[3DXY", 0, 5, {}, "XYc"), + # Cursor-left at column 0 (prev_col not > col, no overwrite) + ("\x1b[2Dhi", 0, 2, {}, "hi"), + # Cursor-left with no visible tokens emitted + ("\x1b[5C\x1b[2Dhi", 5, 7, {}, ""), + # Cursor-left overwrites text, seq tokens preserve column spatial order + ("ab\x1b]8;;http://example.com\x07\x1b[2Dcd", 0, 4, {}, "cd\x1b]8;;http://example.com\x07"), + # Cursor-left into wide char twice, second time on empty token triggers i < 0 break + ("中\x1b[D\x1b[Da", 0, 4, {}, "a "), + ('ab\x1b[5Ccd', 0, 4, {}, 'ab '), + ('abcde\x1b[2Df', 0, 6, {}, 'abcfe'), + ('hello\x1b[5Dw', 0, 5, {}, 'wello'), + ('ab\x1b[10Ccd', 0, 4, {}, 'ab '), + ('XY\x1b[Czy', 0, 4, {}, 'XY z'), + ('XY\x1b[Czy', 0, 5, {}, 'XY zy'), + ('XY\x1b[Czy', 1, 3, {}, 'Y '), + ('XY\x1b[Czy', 1, 4, {}, 'Y z'), + ('LOL\x1b[5Clol', 0, 12, {}, 'LOL lol'), + ('LOL\x1b[5Clol', 1, 11, {}, 'OL lol'), + ('LOL\x1b[5Clol', 2, 11, {}, 'L lol'), + ('LOL\x1b[5Clol', 3, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 4, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 5, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 6, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 7, 11, {}, ' lol'), + ('LOL\x1b[5Clol', 8, 11, {}, 'lol'), + ('LOL\x1b[5Clol', 9, 
11, {}, 'ol'), + # SGR + cursor movement: SGR state update in painter path (line 245) + ('\x1b[31mab\x1b[2Dcd', 0, 4, {}, '\x1b[31mcd\x1b[0m'), + # Tab tabsize=0 in painter path (line 272->280 else branch) + ('ab\x1b[2D\tcd', 0, 4, {'tabsize': 0}, '\tcd'), + # Zero-width grapheme outside clip window in painter (line 290->301) + ('\x1b[2D\u0301hello', 1, 4, {}, 'ell'), + # Wide char partially clipped in painter (lines 298-299) + ('ab\x1b[2D中d', 1, 4, {}, ' d'), + # walk_col >= end in painter reconstruction (327->328) + ('hello\x1b[2Dxy', 0, 3, {}, 'hel'), + # Hole fillchar in painter reconstruction (345->346) + ('\x1b[5Chi', 0, 7, {}, ' hi'), + # Trailing sequences stored at columns after col_limit (352, 354->355, 355->356) + ('abc\x1b[2D', 0, 2, {}, 'ab'), + # Bare ESC not part of any sequence, pass through in painter path (239->240) + ('a\x1bb\x1b[2Dc', 0, 3, {}, 'c\x1bb'), + # Tab with tabsize>0 in painter; `b` falls at col 4, inside (0,5) (277->284, 278->279, 278->280) + ('\x1b[2Da\tb', 0, 5, {'tabsize': 4}, 'a b'), + # propagate_sgr=False in painter path (225->226) + ('ab\x1b[2Dcd', 0, 4, {'propagate_sgr': False}, 'cd'), + # Non-SGR sequence before any visible text in painter (225->226 True) + ('\x1b]8;;http://example.com\x07ab\x1b[2Dcd', 0, 4, {}, '\x1b]8;;http://example.com\x07cd'), + # Bare ESC at end of text in painter (239->240) + ('ab\x1b[2D\x1b', 0, 2, {}, '\x1bab'), + # Wide char overwritten from right side (212 orphan fixup) + ('a中\x1b[Db', 0, 4, {}, 'a b'), + # Tab expansion with col+=1 not inside clip window (277->279, 293) + ('\x1b[2Ca\tb', 2, 4, {'tabsize': 8}, 'a '), + # CR: carriage return resets column to 0, overwriting earlier cells + ('aaa\r\r\rxxx', 0, 4, {}, 'xxx'), + ('abc\rXY', 0, 5, {}, 'XYc'), + ('hello\rworld', 0, 5, {}, 'world'), + # CR moves back to column 0 then writes within clip window + ('abc\rde', 1, 3, {}, 'ec'), + # BS: backspace overwrites previous character + ('abc\bde', 0, 5, {}, 'abde'), + ('abc\b\bXY', 0, 5, {}, 'aXY'), + 
('ab\b\b\bXY', 0, 4, {}, 'XY'), + # HPA: horizontal position absolute (CSI n G) + ('abc\x1b[GXY', 0, 5, {}, 'XYc'), + ('abc\x1b[2GXY', 0, 5, {}, 'aXY'), + ('abc\x1b[5GXY', 0, 7, {}, 'abc XY'), + ('abc\x1b[5GXY', 0, 5, {}, 'abc X'), + ('\x1b[5GXY', 3, 7, {}, ' XY'), + # HPA no-param inside clip window + ('abc\x1b[GXY', 1, 4, {}, 'Yc'), + # walk_col >= end with sequences at column == end (line 351) + ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 5, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), + # Trailing sequences past col_limit (line 374) + ('\x1b[5C\x1b]8;;http://example.com\x07', 0, 3, {'propagate_sgr': False}, ' \x1b]8;;http://example.com\x07'), + # Lone ESC as first visible thing in painter (captured_style = current_style, line 398) + ('\x1b[D\x1b\x1bXy', 0, 3, {}, '\x1b\x1bXy'), + # Hyperlink VISIBLE after captured_style already set + ('a\x1b[C\x1b]8;;http://x\x07hi\x1b]8;;\x07', 0, 5, {}, 'a \x1b]8;;http://x\x07hi\x1b]8;;\x07'), + # Tab with tabsize=0 as first visible thing in painter + ('\x1b[D\tab', 0, 2, {'tabsize': 0}, '\tab'), + # Zero-width grapheme as first visible thing in painter + ('\x1b[D\u0301x', 0, 3, {}, '\u0301x'), + # Generic escape sequence as first visible in painter + ('\x1b[D\x1b[Hxy', 0, 3, {}, '\x1b[Hxy'), +]) +def test_clip_cursor_sequences_expected_behaviour(text, start, end, kwargs, expected): + """Verify clip() output matches terminal-visible columns after cursor moves.""" + result = clip(text, start, end, **kwargs) + assert repr(result) == repr(expected) + + +def test_clip_cursor_left_strict_out_of_bounds(): + """Clip() with control_codes='strict' raises on cursor-left beyond string start.""" + with pytest.raises(ValueError, match='Cursor left movement'): + clip('a\x1b[5Da', 0, 1, control_codes='strict') + + +def test_clip_cursor_left_strict_out_of_bounds_painter(): + """Clip() strict-mode raises on cursor-left beyond start in painter path.""" + with pytest.raises(ValueError, match='Cursor left movement'): + 
clip('\x1b[2Dab', 0, 2, control_codes='strict') + + +def test_clip_cursor_left_out_of_bounds_parse_no_raise(): + """Clip() parse mode silently clamps cursor-left beyond start.""" + assert clip('a\x1b[5Da', 0, 1) == 'a' + assert clip('ab\x1b[99Dcd', 0, 4) == 'cd' + + +def test_clip_strict_cr_allowed(): + """Carriage return is allowed in strict mode (text begins at column 0).""" + assert clip('hello\rworld', 0, 5, control_codes='strict') == 'world' + + +def test_clip_strict_hpa_allowed(): + """HPA is allowed in strict mode (text begins at column 0).""" + assert clip('abc\x1b[5Gde', 0, 10, control_codes='strict') == 'abc de' + + +def test_clip_strict_cursor_left_allowed(): + """Cursor-left within bounds is allowed in strict mode.""" + assert clip('hello\x1b[2Dxy', 0, 5, control_codes='strict') == 'helxy' + + +def test_clip_strict_indeterminate_sequence_painter(): + """Clip() strict-mode raises on indeterminate sequence in painter path.""" + with pytest.raises(ValueError, match='Indeterminate cursor sequence'): + clip('a\x1b[D\x1b[Hb', 0, 3, control_codes='strict') diff --git a/tests/test_core.py b/tests/test_core.py index 024dcdba..dd1e3b7d 100755 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,5 @@ """Core tests for wcwidth module.""" # std imports -import sys import importlib.metadata # 3rd party @@ -8,9 +7,7 @@ # local import wcwidth - -_wcwidth_module = sys.modules['wcwidth.wcwidth'] -_WIDTH_FAST_PATH_MIN_LEN = _wcwidth_module._WIDTH_FAST_PATH_MIN_LEN +from wcwidth._width import _WIDTH_FAST_PATH_MIN_LEN def test_package_version(): @@ -68,9 +65,8 @@ def test_hello_jp(): """ Width of Japanese phrase: コンニチハ, セカイ! - Given a phrase of 5 and 3 Katakana ideographs, joined with - 3 English-ASCII punctuation characters, totaling 11, this - phrase consumes 19 cells of a terminal emulator. + Given a phrase of 5 and 3 Katakana ideographs, joined with 3 English-ASCII punctuation + characters, totaling 11, this phrase consumes 19 cells of a terminal emulator. 
""" # given, phrase = 'コンニチハ, セカイ!' @@ -90,8 +86,7 @@ def test_wcswidth_substr(): """ Test wcswidth() optional 2nd parameter, ``n``. - ``n`` determines at which position of the string - to stop counting length. + ``n`` determines at which position of the string to stop counting length. """ # given, phrase = 'コンニチハ, セカイ!' @@ -414,13 +409,8 @@ def test_bengali_nukta_mc(): @pytest.mark.parametrize("repeat", [1, _WIDTH_FAST_PATH_MIN_LEN]) def test_mc_width_consistency(repeat): - # width(), wcswidth(), and per-grapheme width sums must all agree. - # - # The repeat parameter ensures both the short (parse) and long (fast) code - # paths of width() are exercised. At repeat=1 the phrases are short enough - # to go through character-by-character parse mode. At repeat=_WIDTH_FAST_PATH_MIN_LEN - # every phrase exceeds the threshold and takes the fast path that delegates - # to wcswidth(). + """Check width() to wcswidth() consistency.""" + # repeat value 'WIDTH_FAST_PATH_MIN_LEN' ensures both "fast" and "slow" paths are taken phrases = [ "\u0915\u094D\u0937\u093F", "\u0b95\u0bcd\u0bb7\u0bcc", @@ -464,6 +454,11 @@ def test_virama_conjunct(phrase, expected): assert wcwidth.width(phrase) == expected +def test_zwj_at_end_of_string(): + """ZWJ at end of string (not after virama) is consumed with zero width.""" + assert wcwidth.wcswidth('a\u200D') == 1 + + def test_soft_hyphen(): # Test SOFT HYPHEN, category 'Cf' usually are zero-width, but most # implementations agree to draw it was '1' cell, visually @@ -493,3 +488,24 @@ def test_prepended_concatenation_mark_width(codepoint, name): """Prepended Concatenation Marks have width 1, not 0.""" # https://github.com/jquast/wcwidth/issues/119 assert wcwidth.wcwidth(chr(codepoint)) == 1 + + +def test_legacy_module(): + """Verify legacy ``wcwidth.wcwidth`` module's public items are importable.""" + # pylint: disable=import-outside-toplevel + # std imports + import sys + + # Access the legacy submodule via sys.modules (matching 0.6.0 where + # 
'import wcwidth.wcwidth' returned the function, not the module). + _legacy = sys.modules['wcwidth.wcwidth'] + + for name in _legacy.__all__: + attr = getattr(_legacy, name) + assert attr is not None, f"wcwidth.wcwidth.{name} is None" + + # Verify that individual imports from the legacy path also work, + # e.g. 'from wcwidth.wcwidth import wcswidth' + for name in _legacy.__all__: + obj = getattr(_legacy, name) + assert obj is not None, f"could not import {name} from wcwidth.wcwidth" diff --git a/tests/test_emojis.py b/tests/test_emojis.py index 9a962cc2..f49c9ef6 100644 --- a/tests/test_emojis.py +++ b/tests/test_emojis.py @@ -1,4 +1,5 @@ """Tests for emoji width measurement and ZWJ sequences.""" + # std imports import os diff --git a/tests/test_grapheme.py b/tests/test_grapheme.py index fb987234..8d139b99 100644 --- a/tests/test_grapheme.py +++ b/tests/test_grapheme.py @@ -1,4 +1,5 @@ """Tests for grapheme cluster segmentation.""" + # std imports import os diff --git a/tests/test_hyperlink.py b/tests/test_hyperlink.py new file mode 100644 index 00000000..7b083a82 --- /dev/null +++ b/tests/test_hyperlink.py @@ -0,0 +1,75 @@ +"""Tests for OSC 8 hyperlink parsing.""" + +# 3rd party +import pytest + +# local +from wcwidth.hyperlink import Hyperlink, HyperlinkParams + +PARAMS_PARSE_VALID = [ + ('\x1b]8;;http://example.com\x07', 'http://example.com', '', '\x07'), + ('\x1b]8;id=a;http://example.com\x1b\\', 'http://example.com', 'id=a', '\x1b\\'), +] + + +@pytest.mark.parametrize('seq,url,params,term', PARAMS_PARSE_VALID) +def test_hyperlinkparams_parse_valid(seq, url, params, term): + """Parse a valid OSC 8 open sequence.""" + result = HyperlinkParams.parse(seq) + assert result is not None + assert result.url == url + assert result.params == params + assert result.terminator == term + + +@pytest.mark.parametrize('seq', [ + 'not an escape', + '\x1b[31m', + '', +]) +def test_hyperlinkparams_parse_invalid(seq): + """Parse an invalid/non-OSC-8 sequence returns None.""" + 
assert HyperlinkParams.parse(seq) is None + + +def test_hyperlinkparams_make_open(): + assert HyperlinkParams(url='http://example.com', params='id=a', terminator='\x07').make_open() == '\x1b]8;id=a;http://example.com\x07' + + +def test_hyperlinkparams_make_close(): + assert HyperlinkParams(url='http://example.com', terminator='\x07').make_close() == '\x1b]8;;\x07' + + +_HL = '\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07' + + +def test_hyperlink_parse_valid(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.text == 'Hello' + assert hl.params.url == 'http://example.com' + + +@pytest.mark.parametrize('text,start', [ + ('Hello world', 0), + ('\x1b[31mHello\x1b[0m', 0), # SGR, not OSC 8 + ('\x1b]8;;http://example.com\x07Hello', 0), # open without close +]) +def test_hyperlink_parse_returns_none(text, start): + assert Hyperlink.parse(text, start) is None + + +def test_hyperlink_find_close_not_found(): + assert Hyperlink.find_close('no escape here', 0) == (-1, -1) + + +def test_hyperlink_make_sequence(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.make_sequence() == _HL + + +def test_hyperlink_display_width(): + hl = Hyperlink.parse(_HL) + assert hl is not None + assert hl.display_width() == 5 diff --git a/tests/test_justify.py b/tests/test_justify.py index 71dec619..f2639e8c 100644 --- a/tests/test_justify.py +++ b/tests/test_justify.py @@ -1,4 +1,5 @@ """Tests for text justification functions.""" + # local from wcwidth import ljust, rjust, width, center diff --git a/tests/test_sgr_state.py b/tests/test_sgr_state.py index db9c8a9c..ecba402f 100644 --- a/tests/test_sgr_state.py +++ b/tests/test_sgr_state.py @@ -1,4 +1,5 @@ """Tests for SGR state tracking and propagation.""" + from __future__ import annotations # std imports diff --git a/tests/test_textwrap.py b/tests/test_textwrap.py index e3e88070..33da72a4 100644 --- a/tests/test_textwrap.py +++ b/tests/test_textwrap.py @@ -1,4 +1,5 @@ """Tests for sequence-aware text wrapping 
functions.""" + # std imports import sys import platform @@ -76,17 +77,14 @@ def _colorize(text): ) -EDGE_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ ('', 10, []), (' ', 10, []), ('\u5973', 0, ['\u5973']), ('\u5973', 1, ['\u5973']), (ZWJ_FAMILY, 1, [ZWJ_FAMILY]), (HANGUL_GA, 1, [HANGUL_GA]), -] - - -@pytest.mark.parametrize('text,w,expected', EDGE_CASES) +]) def test_wrap_edge_cases(text, w, expected): assert wrap(text, w) == expected @@ -95,28 +93,22 @@ def test_wrap_initial_indent(): assert wrap('hello world', 10, initial_indent='> ') == ['> hello', 'world'] -LONG_WORD_CASES = [ +@pytest.mark.parametrize('text,w,break_long,expected', [ ('abcdefghij', 3, True, ['abc', 'def', 'ghi', 'j']), ('abcdefghij', 3, False, ['abcdefghij']), -] - - -@pytest.mark.parametrize('text,w,break_long,expected', LONG_WORD_CASES) +]) def test_wrap_long_words(text, w, break_long, expected): assert wrap(text, w, break_long_words=break_long) == expected -HYPHEN_LONG_WORD_CASES = [ +@pytest.mark.parametrize('text,w,break_hyphens,propagate,expected', [ ('a-b-c-d', 3, True, True, ['a-', 'b-', 'c-d']), ('a-b-c-d', 3, False, True, ['a-b', '-c-', 'd']), ('---', 2, True, True, ['--', '-']), ('a---b', 2, True, True, ['a-', '--', 'b']), ('a-\x1b[31mb', 2, True, True, ['a-\x1b[31m\x1b[0m', '\x1b[31mb\x1b[0m']), ('a-\x1b[31mb', 2, True, False, ['a-\x1b[31m', 'b']), -] - - -@pytest.mark.parametrize('text,w,break_hyphens,propagate,expected', HYPHEN_LONG_WORD_CASES) +]) def test_wrap_hyphen_long_words(text, w, break_hyphens, propagate, expected): assert wrap(text, w, break_on_hyphens=break_hyphens, propagate_sgr=propagate) == expected @@ -182,7 +174,7 @@ def test_wrap_multiline_matches_stdlib(): assert wrap(given, 30) == textwrap.wrap(given, 30) -UNICODE_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ # CJK (2 cells each) ('\u4e2d\u6587\u5b57\u7b26', 4, ['\u4e2d\u6587', '\u5b57\u7b26']), ('\u4e2d\u6587\u5b57', 5, ['\u4e2d\u6587', '\u5b57']), @@ -192,18 +184,14 @@ def 
test_wrap_multiline_matches_stdlib(): (f'{FAMILY_ZWJ} ab', 4, [FAMILY_ZWJ, 'ab']), (f'{SMILEY_VS16} ab', 3, [SMILEY_VS16, 'ab']), ('\U0001F469\U0001F467\U0001F466', 4, ['\U0001F469\U0001F467', '\U0001F466']), -] - - -@pytest.mark.parametrize('text,w,expected', UNICODE_CASES) +]) def test_wrap_unicode(benchmark, text, w, expected): kwargs = {'break_on_hyphens': False} if '-' in text else {} result = benchmark(wrap, text, w, **kwargs) assert result == expected -# Escape sequence preservation (with propagate_sgr=True default) -SEQUENCE_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ # SGR sequences propagated across lines (f'{SGR_RED}red{SGR_RESET} blue', 4, [f'{SGR_RED}red{SGR_RESET}', 'blue']), # SGR at end of line propagates to next line @@ -223,41 +211,34 @@ def test_wrap_unicode(benchmark, text, w, expected): ['x\x1b[31mab\x1b[0m', '\x1b[31mcde\x1b[0m', '\x1b[31mfgh\x1b[0m', '\x1b[31mij\x1b[0m']), # Fs sequence (ESC d) - zero-width, stays with preceding text ('abc\x1bdefghij', 3, ['abc\x1bd', 'efg', 'hij']), -] - -SEQUENCE_CASES_NO_PROPAGATE = [ - (f'hello{SGR_RED} world', 6, [f'hello{SGR_RED}', 'world']), - ('x\x1b[31mabcdefghij\x1b[0m', 3, ['x\x1b[31mab', 'cde', 'fgh', 'ij\x1b[0m']), -] - - -@pytest.mark.parametrize('text,w,expected', SEQUENCE_CASES) +]) def test_wrap_sequences(benchmark, text, w, expected): + """Escape sequence preservation (with propagate_sgr=True default)""" assert benchmark(wrap, text, w) == expected -@pytest.mark.parametrize('text,w,expected', SEQUENCE_CASES_NO_PROPAGATE) +@pytest.mark.parametrize('text,w,expected', [ + (f'hello{SGR_RED} world', 6, [f'hello{SGR_RED}', 'world']), + ('x\x1b[31mabcdefghij\x1b[0m', 3, ['x\x1b[31mab', 'cde', 'fgh', 'ij\x1b[0m']), +] +) def test_wrap_sequences_no_propagate(text, w, expected): result = wrap(text, w, propagate_sgr=False) assert result == expected -# Mixed: sequences + unicode -MIXED_CASES = [ +@pytest.mark.parametrize('text,w,expected', [ (f'{SGR_RED}\u4e2d\u6587{SGR_RESET} ab', 5, 
[f'{SGR_RED}\u4e2d\u6587{SGR_RESET}', 'ab']), (f'{SGR_RED}{FAMILY_ZWJ}{SGR_RESET} ab', 4, [f'{SGR_RED}{FAMILY_ZWJ}{SGR_RESET}', 'ab']), (f'{SGR_BOLD}\u4e2d{SGR_RESET}y z', 4, [f'{SGR_BOLD}\u4e2d{SGR_RESET}y', 'z']), -] - - -@pytest.mark.parametrize('text,w,expected', MIXED_CASES) +]) def test_wrap_mixed(benchmark, text, w, expected): + """Test mixed sequences + unicode.""" result = benchmark(wrap, text, w) assert result == expected -# Tabsize with wide characters - tests column alignment with different cell widths -TABSIZE_WIDE_CASES = [ +@pytest.mark.parametrize('text,w,tabsize,expected', [ # CJK (2 cells) + tab: tabsize=4, '\u4e2d' is 2 cols, tab expands to col 4 ('\u4e2d\ta b', 6, 4, ['\u4e2d a', 'b']), # CJK + tab with tabsize=8: '\u4e2d' is 2 cols, tab expands to col 8 @@ -268,10 +249,7 @@ def test_wrap_mixed(benchmark, text, w, expected): ('\u4e2d\u6587\ta', 8, 4, ['\u4e2d\u6587 a']), # ASCII + tab + CJK: 'a' is 1 col, tab to 4 (3 spaces), CJK is 2 cols ('a\t\u4e2d b', 8, 4, ['a \u4e2d b']), -] - - -@pytest.mark.parametrize('text,w,tabsize,expected', TABSIZE_WIDE_CASES) +]) @pytest.mark.skipif( platform.python_implementation() == 'PyPy' and sys.version_info < (3, 9), reason='PyPy 3.8 str.expandtabs() counts UTF-8 bytes instead of characters' @@ -286,7 +264,8 @@ def test_wrap_tabsize_wide_chars(text, w, tabsize, expected): OSC_START_BEL = '\x1b]8;;http://example.com\x07' OSC_END_BEL = '\x1b]8;;\x07' -HYPERLINK_WORD_BOUNDARY_CASES = [ + +@pytest.mark.parametrize('text,w,expected', [ ( # standard, ST-variant, f'{OSC_START_ST}link{OSC_END_ST}more', 5, @@ -408,18 +387,14 @@ def test_wrap_tabsize_wide_chars(text, w, tabsize, expected): '\x1b]8;foo=bar:id=mylink;http://example.com\x1b\\Click\x1b]8;;\x1b\\', '\x1b]8;foo=bar:id=mylink;http://example.com\x1b\\here\x1b]8;;\x1b\\', ], - ), -] - - -@pytest.mark.parametrize('text,w,expected', HYPERLINK_WORD_BOUNDARY_CASES) + ),]) def test_wrap_hyperlink_word_boundary(text, w, expected): """OSC hyperlink sequences should 
act as word boundaries.""" result = wrap(text, w) assert result == expected -PLACEHOLDER_STDLIB_CASES = [ +@pytest.mark.parametrize('text,kwargs', [ ('The quick brown fox jumps over the lazy dog', {'width': 10, 'max_lines': 3, 'placeholder': '...'}), ('1234567890 1234567890 extra', @@ -444,10 +419,7 @@ def test_wrap_hyperlink_word_boundary(text, w, expected): {'width': 10, 'subsequent_indent': ' ', 'max_lines': 2, 'placeholder': '...'}), ('hello world foo bar', {'width': 10, 'initial_indent': '> ', 'max_lines': 2, 'placeholder': '...'}), -] - - -@pytest.mark.parametrize('text,kwargs', PLACEHOLDER_STDLIB_CASES) +]) def test_wrap_max_lines_matches_stdlib(text, kwargs): expected = _adjust_stdlib_result(textwrap.wrap(text, **kwargs), kwargs) assert wrap(text, **kwargs) == expected @@ -460,7 +432,7 @@ def test_wrap_placeholder_too_large(): textwrap.wrap('fox', width=1, max_lines=3, placeholder='...') -MAX_LINES_SEQUENCE_CASES = [ +@pytest.mark.parametrize('text,w,ml,ph,expected', [ (f'{SGR_RED}hello world foo bar{SGR_RESET}', 8, 2, '...', [f'{SGR_RED}hello{SGR_RESET}', f'{SGR_RED}world...{SGR_RESET}']), (f'{SGR_RED}hello{SGR_RESET} world foo', @@ -470,10 +442,7 @@ def test_wrap_placeholder_too_large(): ('\u4e2d\u6587 \u5b57\u7b26 hello', 5, 1, '~', ['\u4e2d\u6587~']), ('\u4e2d\u6587 \u5b57\u7b26 hello world', 5, 2, '~', ['\u4e2d\u6587', '\u5b57\u7b26~']), ('\u4e2d\u6587\u5b57\u7b26 hello', 12, 1, '...', ['\u4e2d\u6587\u5b57\u7b26...']), -] - - -@pytest.mark.parametrize('text,w,ml,ph,expected', MAX_LINES_SEQUENCE_CASES) +]) def test_wrap_max_lines_sequences(text, w, ml, ph, expected): assert wrap(text, w, max_lines=ml, placeholder=ph) == expected @@ -494,19 +463,14 @@ def test_wrap_max_lines_hyperlink_close_on_prev_line(): assert result == [f'{OSC_START_ST}ab{OSC_END_ST}...'] -# -- expand_tabs, replace_whitespace, fix_sentence_endings -- - -STDLIB_PARAM_CASES = [ +@pytest.mark.parametrize('text,kwargs', [ ('hello\tworld', {'width': 20, 'expand_tabs': False, 
'replace_whitespace': False}), ('hello\tworld foo\tbar baz', {'width': 12, 'expand_tabs': False, 'tabsize': 8}), ('hello\nworld', {'width': 20, 'replace_whitespace': False}), ('a\t b\n c', {'width': 20, 'replace_whitespace': False}), ('Hello world. This is a test. More text.', {'width': 20, 'fix_sentence_endings': True}), ('Dr. Smith went to Washington. He left.', {'width': 20, 'fix_sentence_endings': True}), -] - - -@pytest.mark.parametrize('text,kwargs', STDLIB_PARAM_CASES) +]) def test_wrap_stdlib_params(text, kwargs): assert wrap(text, **kwargs) == textwrap.wrap(text, **kwargs) diff --git a/tests/test_ucslevel.py b/tests/test_ucslevel.py index 979cfe0f..9aea2c9b 100644 --- a/tests/test_ucslevel.py +++ b/tests/test_ucslevel.py @@ -1,4 +1,5 @@ """Unicode version level tests for wcwidth.""" + # local import wcwidth diff --git a/tests/test_width.py b/tests/test_width.py index 7c45f138..8e43b47b 100644 --- a/tests/test_width.py +++ b/tests/test_width.py @@ -1,4 +1,5 @@ """Tests for width() function.""" + # 3rd party import pytest @@ -45,8 +46,10 @@ def test_width_control_codes_ignore(text, expected, name): ('hello\x7fworld', 'DEL'), ('hello\x80world', 'C1_control'), ('hello\nworld', 'LF'), + ('hello\rworld', 'CR'), ('hello\x1b[Hworld', 'cursor_home'), ('hello\x1b[Aworld', 'cursor_up'), + ('hello\x1b[5Gworld', 'hpa'), ] @@ -61,9 +64,9 @@ def test_width_control_codes_strict_raises(text, name): ('hello\x07world', 10, 'BEL'), ('hello\x00world', 10, 'NUL'), ('abc\bd', 3, 'backspace'), - ('abc\rxy', 3, 'CR'), ('\x1b[31mred\x1b[0m', 3, 'SGR_sequence'), ('a\x1b[2Cb', 4, 'cursor_right'), + ('ab\x1b[Db', 2, 'cursor_left'), ('\x1b', 0, 'lone_ESC'), ('a\x1bb', 1, 'fs_sequence_between'), ('\x1b!', 1, 'ESC_unrecognized'), @@ -107,6 +110,11 @@ def test_width_strict_indeterminate_raises(seq, cap_name): ('abcd\x1b[2De', 4, 'cursor_left'), ('\x1b[31mred\x1b[0m', 3, 'SGR'), ('ab\x1b[Hcd', 4, 'indeterminate'), + ('def\x1b[3Dabc', 3, 'cursor_left_overwrite'), + ('def\x1b[10Dabc', 3, 
'cursor_left_past_start'), + ('abc\x1b[5Gde', 6, 'hpa_parse'), + ('abc\x1b[Gde', 3, 'hpa_no_param'), + ('\x1b[5Gabc', 7, 'hpa_before_text'), ] @@ -191,29 +199,26 @@ def test_vs16_selector(): def test_zwj_with_non_emoji_chars(): - """ZWJ with non-emoji characters and trailing VS16.""" - # ZWJ (Zero Width Joiner) skips both itself and the following character, treating them as a - # failed emoji ZWJ sequence. When followed by VS16, the VS16 should NOT apply to the earlier - # emoji because VS16 must immediately follow the character it modifies. - # - # In the full parse loop, VS16 checks `last_measured_idx == idx - 1` (immediate adjacency). - # The ZWJ+char skip means VS16 is not adjacent to the smiley, so VS16 has no effect. - # + """ + ZWJ with non-emoji characters and trailing VS16. + + These are invalid Unicode sequences (ZWJ followed by non-emoji), so behavior is implementation- + defined. The emoji base (smiley, width 1) is narrow, and VS16 looks back to it across the ZWJ- + consumed characters, adding 1 cell for a total width of 2. + """ # Control test, assert wcwidth.width("\u263A\uFE0F") == 2 # smiley + VS16 = 2 - # ZWJ followed by non-emoji, VS16 does not apply (not adjacent) - assert wcwidth.width("\u263A\u200Da\uFE0F") == 1 - assert wcwidth.width("\u263A\u200Dx\uFE0F") == 1 - assert wcwidth.width("\u263A\u200Da\u200Db\uFE0F") == 1 + # ZWJ followed by non-emoji: VS16 applies to the smiley base + assert wcwidth.width("\u263A\u200Da\uFE0F") == 2 + assert wcwidth.width("\u263A\u200Dx\uFE0F") == 2 + assert wcwidth.width("\u263A\u200Da\u200Db\uFE0F") == 2 # ZWJ at end of string assert wcwidth.width("\u263A\u200D") == 1 # smiley + ZWJ = 1 # Long strings (>20 chars) use fast path which routes to wcswidth(). - # wcswidth() has more lenient VS16 handling, causing VS16 to incorrectly apply (!) 
- # Multiply by 10 to exceed threshold: "\u263A\u200Da\uFE0F" (4 chars) * 10 = 40 chars - assert wcwidth.width("\u263A\u200Da\uFE0F" * 10) == 20 # (smiley(1) + ZWJ+a(0) + VS16(+1)) * 10 (!) + assert wcwidth.width("\u263A\u200Da\uFE0F" * 10) == 20 def test_vs16_after_control_chars(): @@ -229,10 +234,9 @@ def test_vs16_after_control_chars(): assert wcwidth.width("\u263A\x0d\uFE0F") == 1 # smiley(1) + CR(reset) + VS16(0), extent=1 # Long strings (>20 chars) use fast path which routes to wcswidth(). - # wcswidth() has more lenient VS16 handling (`last_measured_idx >= 0` vs `== idx - 1`), - # causing VS16 to incorrectly apply when separated by control chars (!) + # In ignore mode, BEL is stripped, so VS16 is adjacent to the smiley and applies correctly. # Multiply by 10 to exceed threshold - assert wcwidth.width(("\u263A\x07\uFE0F") * 10) == 20 # (smiley(1) + BEL(0) + VS16(+1)) * 10 (!) + assert wcwidth.width(("\u263A\x07\uFE0F") * 10) == 20 # (smiley(1) + BEL-stripped(0) + VS16(+1)) * 10 def test_width_long_horizontal_fastpath(): @@ -267,6 +271,42 @@ def test_carriage_return_resets_column(): assert wcwidth.width('abc\rde') == 3 +def test_carriage_return_strict_raises(): + """CR in strict mode raises ValueError (indeterminate starting column).""" + with pytest.raises(ValueError, match='Horizontal movement'): + wcwidth.width('hello\rworld', control_codes='strict') + + +def test_hpa_parse_best_effort(): + """HPA in parse mode assumes string begins at column 0.""" + assert wcwidth.width('abc\x1b[5Gde') == 6 + assert wcwidth.width('abc\x1b[Gde') == 3 + assert wcwidth.width('\x1b[10Ghi') == 11 + + +def test_hpa_strict_raises(): + """HPA in strict mode raises ValueError (indeterminate starting column).""" + with pytest.raises(ValueError, match='horizontal position'): + wcwidth.width('abc\x1b[5Gde', control_codes='strict') + + +def test_cursor_left_strict_out_of_bounds(): + """Cursor-left beyond string start raises ValueError in strict mode.""" + with pytest.raises(ValueError, 
match='Cursor left movement'): + wcwidth.width('a\x1b[5Da', control_codes='strict') + + +def test_cursor_left_out_of_bounds_parse_no_raise(): + """Cursor-left beyond string start is silently clamped in parse mode.""" + assert wcwidth.width('a\x1b[5Da') == 1 + assert wcwidth.width('abc\x1b[99Ddef') == 3 # 99D clamped to col 0, then a,b,c overwritten by d,e,f + + +def test_cursor_left_out_of_bounds_ignore_mode(): + """Cursor-left beyond string start is zero-width in ignore mode.""" + assert wcwidth.width('a\x1b[5Da', control_codes='ignore') == 2 + + def test_iter_sequences_lone_esc(): """Lone ESC is yielded as a sequence.""" assert list(wcwidth.iter_sequences('\x1b')) == [('\x1b', True)] diff --git a/tox.ini b/tox.ini index 2915bedb..8c5d19a4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = update, fetch, compile, autopep8, docformatter, docformatter_check, isort, isort_check, pylint, pylint_tests, flake8, flake8_tests, pydocstyle, mypy, codespell, format, lint, docs, verify_tables, py{38, 39, 310, 311, 312, 313, 314}, pypy{38, 39, 310, 311} +envlist = update, fetch, compile, autopep8, docformatter, docformatter_check, isort, isort_check, pylint, pylint_tests, flake8, flake8_tests, pydocstyle, mypy, codespell, format, lint, docs, verify_tables, py{38, 39, 310, 311, 312, 313, 314} skip_missing_interpreters = true [base] @@ -8,10 +8,9 @@ pip_compile_command = pip-compile --resolver=backtracking --strip-extras --no-em [testenv] deps = -r requirements-tests39.txt commands = {envpython} -m pytest --cov-config={toxinidir}/tox.ini {posargs:\ - -n auto \ --verbose \ --junit-xml=.tox/results.{envname}.xml \ - --durations=3 \ + --durations=10 \ } \ --log-format='%(levelname)s %(relativeCreated)2.2f %(filename)s:%(lineno)d %(message)s' \ tests @@ -62,22 +61,21 @@ precision = 1 [coverage:paths] source = wcwidth/ -# wcwidth itself has no 3rd party dependencies, but to ensure the best available -# version for the newest to oldest python versions for testing, must also
use some -# targeted versions to 'compile' those requirements into their frozen form, -# otherwise incompatible packages would be pinned. At the time of this writing the -# files compiled for version 3.9 and later are compiled by python3.13 [WIP]. [testenv:compile] -basepython = python3.13 +basepython = python3.14 commands = python -m compileall {toxinidir}/wcwidth {toxinidir}/bin {toxinidir}/tests {toxinidir}/docs +# wcwidth itself has no 3rd party dependencies, but to ensure the best available +# version for the newest to oldest python versions for testing, must also use some +# targeted versions to 'compile' those requirements into their frozen form, +# otherwise incompatible packages can be pinned. [testenv:update_requirements_update] -basepython = python3.13 +basepython = python3.14 deps = pip-tools commands = {[base]pip_compile_command} requirements-update.in -o requirements-update.txt [testenv:update_requirements_docs] -basepython = python3.12 +basepython = python3.14 deps = pip-tools commands = {[base]pip_compile_command} requirements-docs.in -o docs/requirements.txt @@ -94,11 +92,9 @@ commands = {[base]pip_compile_command} requirements-tests38.in -o requirements-t [testenv:py38] deps = -r requirements-tests38.txt -[testenv:pypy38] -deps = -r requirements-tests38.txt - [testenv:update] -basepython = python3.14 +# it is best to use latest python for latest 'unicodedata' for named table items in generated code +basepython = python3.15 usedevelop = true deps = -r requirements-update.txt commands = python {toxinidir}/bin/update-tables.py {posargs:--fetch-all-versions} @@ -140,7 +136,7 @@ commands = flake8 --exclude=tests docs/ wcwidth/ bin/ tests/ [testenv:docs] # matches .readthedocs.yaml and environment -basepython = python3.12 +basepython = python3.14 deps = -r {toxinidir}/docs/requirements.txt commands = sphinx-build -W docs/ build/sphinx @@ -169,9 +165,8 @@ commands = pydocstyle --source --explain {toxinidir}/wcwidth [testenv:docformatter] basepython = 
python3.13 -deps = docformatter>=1.7.7 - untokenize -commands = - docformatter --in-place --recursive --pre-summary-newline \ +deps = docformatter==1.7.7 +commands = docformatter --in-place --recursive --pre-summary-newline \ --wrap-summaries=100 --wrap-descriptions=100 \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ @@ -183,17 +178,17 @@ commands = docformatter --check --diff --recursive --pre-summary-newline \ {toxinidir}/wcwidth/ {toxinidir}/bin {toxinidir}/tests/ [testenv:isort_check] -basepython = python3.13 +basepython = python3.14 deps = isort commands = isort --diff --check-only wcwidth tests bin [testenv:flake8_tests] -basepython = python3.13 +basepython = python3.14 deps = flake8 commands = flake8 --ignore=E501,W504,F401 tests/ [testenv:pylint_tests] -basepython = python3.13 +basepython = python3.14 deps = pytest pylint commands = pylint --rcfile={toxinidir}/.pylintrc \ @@ -211,7 +206,7 @@ warn_redundant_casts = true warn_unused_ignores = true [testenv:codespell] -basepython = python3.13 +basepython = python3.14 deps = codespell commands = codespell --skip="*.pyc,htmlcov,_build,build,*.egg-info,.tox,data,./tests/*.txt,*.csv,*.ods,table_*.py,docs/specs.rst,*.isorted" \ --ignore-words-list="thirdparty,claus,oclock,womens,aprox" \ @@ -228,16 +223,19 @@ commands = {[testenv:isort]commands} {[testenv:autopep8]commands} [testenv:lint] -basepython = python3.13 +basepython = python3.14 deps = {[testenv:flake8]deps} + {[testenv:mypy]deps} {[testenv:isort_check]deps} {[testenv:pydocstyle]deps} {[testenv:pylint_tests]deps} {[testenv:codespell]deps} -commands = {[testenv:flake8]commands} - {[testenv:flake8_tests]commands} - {[testenv:isort_check]commands} - {[testenv:pydocstyle]commands} +commands = {[testenv:compile]commands} + {[testenv:flake8]commands} + {[testenv:mypy]commands} {[testenv:pylint]commands} + {[testenv:flake8_tests]commands} {[testenv:pylint_tests]commands} {[testenv:codespell]commands} + {[testenv:pydocstyle]commands} + 
{[testenv:isort_check]commands} diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 400c8a61..2bab0b50 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -1,43 +1,50 @@ """ -Wcwidth module. +Python 'wcwidth' module. https://github.com/jquast/wcwidth """ -# re-export all functions & definitions, even private ones, from top-level -# module path, to allow for 'from wcwidth import _private_func'. Of course, -# user beware that any _private functions or variables not exported by __all__ -# may disappear or change signature at any future version. + +# re-export common and outermost functions & definitions, even a few private +# ones, some for convenience, others for legacy, only the items in __all__ are +# documented as public API # local -from .wcwidth import ZERO_WIDTH # noqa -from .wcwidth import (WIDE_EASTASIAN, - AMBIGUOUS_EASTASIAN, - VS16_NARROW_TO_WIDE, - clip, - ljust, - rjust, - width, - center, - wcwidth, - wcswidth, - list_versions, - iter_sequences, - strip_sequences, - _wcmatch_version, - _wcversion_value) +from ._clip import clip +from .align import ljust, rjust, center +from ._width import width from .bisearch import bisearch as _bisearch -from .grapheme import grapheme_boundary_before # noqa -from .grapheme import iter_graphemes, iter_graphemes_reverse +from .grapheme import iter_graphemes, iter_graphemes_reverse, grapheme_boundary_before from .textwrap import SequenceTextWrapper, wrap +from ._wcswidth import wcswidth +from .hyperlink import Hyperlink, HyperlinkParams from .sgr_state import propagate_sgr +from .table_vs16 import VS16_NARROW_TO_WIDE +from .table_wide import WIDE_EASTASIAN +from .table_zero import ZERO_WIDTH +from .table_ambiguous import AMBIGUOUS_EASTASIAN +from .escape_sequences import iter_sequences, strip_sequences +from .unicode_versions import list_versions + +# Pre-import the legacy submodule so that sys.modules['wcwidth.wcwidth'] is +# populated during package initialization. 
This matches the 0.6.0 behavior +# where ``from .wcwidth import wcwidth`` would have already loaded the +# submodule. Without this, a later ``import wcwidth.wcwidth`` triggers +# on-disk file discovery which rebinds wcwidth.wcwidth from the function to +# the module object. +# +# NOTE: this sort order is important for legacy import API compatibility before release 0.7.0 +from . import wcwidth as _wcwidth_module # isort:skip +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value # isort:skip + # The __all__ attribute defines the items exported from statement, # 'from wcwidth import *', but also to say, "This is the public API". __all__ = ('wcwidth', 'wcswidth', 'width', 'iter_sequences', 'iter_graphemes', 'iter_graphemes_reverse', 'grapheme_boundary_before', 'ljust', 'rjust', 'center', 'wrap', 'clip', 'strip_sequences', - 'list_versions', 'propagate_sgr') + 'list_versions', 'propagate_sgr', 'Hyperlink', 'HyperlinkParams') # Using 'hatchling', it does not seem to provide the pyproject.toml nicety, "dynamic = ['version']" # like flit_core, maybe there is some better way but for now we have to duplicate it in both places -__version__ = '0.6.0' +# Prefer the installed distribution version when available (helps test environments) +__version__ = '0.7.0' # don't forget to also update pyproject.toml:version diff --git a/wcwidth/_clip.py b/wcwidth/_clip.py new file mode 100644 index 00000000..aa268073 --- /dev/null +++ b/wcwidth/_clip.py @@ -0,0 +1,675 @@ +"""This is a python implementation of clip().""" +from __future__ import annotations + +# std imports +import enum + +from typing import Literal, Optional, NamedTuple + +# local +from ._width import width +from .grapheme import iter_graphemes +from .hyperlink import Hyperlink, HyperlinkParams +from .sgr_state import (_SGR_STATE_DEFAULT, + _SGRState, + _sgr_state_update, + _sgr_state_is_active, + _sgr_state_to_sequence) +from .escape_sequences import (_SEQUENCE_CLASSIFY, + _HORIZONTAL_CURSOR_MOVEMENT, + 
INDETERMINATE_EFFECT_SEQUENCE) + + +class _HyperlinkAction(enum.Enum): + """Outcome of processing an OSC 8 hyperlink unit.""" + + NO_CLOSE = enum.auto() # open sequence without matching close + EMPTY = enum.auto() # hyperlink with no visible inner text + OUTSIDE = enum.auto() # hyperlink entirely outside the clip window + VISIBLE = enum.auto() # hyperlink overlaps the clip window + + +class _HyperlinkResult(NamedTuple): + """ + Result of processing an OSC 8 hyperlink. + + Only the fields relevant to each action are populated. + """ + + action: _HyperlinkAction + close_end: int = 0 + inner_width: int = 0 + open_seq: str = '' + clipped_inner: str = '' + close_seq: str = '' + clipped_width: int = 0 + hl_col_end: int = 0 + + +def _apply_sgr_wrap(result: str, captured_style: Optional[_SGRState]) -> str: + """ + Apply SGR prefix/suffix around *result*. + + If an SGR state was captured at the first visible character, prefix the result with the + corresponding SGR sequence and suffix with a reset if any styles are active. + """ + if captured_style is not None: + if prefix := _sgr_state_to_sequence(captured_style): + result = prefix + result + if _sgr_state_is_active(captured_style): + result += '\x1b[0m' + return result + + +def _process_hyperlink( + text: str, + start: int, + end: int, + fillchar: str, + tabsize: int, + ambiguous_width: int, + control_codes: Literal['parse', 'strict', 'ignore'], + *, + params: HyperlinkParams, + match_end: int, + col: int, +) -> _HyperlinkResult: + """ + Process an OSC 8 hyperlink unit. + + Finds the matching close sequence, measures the inner text width, and determines whether the + hyperlink is empty, outside the clip window, or visible (requiring inner-text clipping). 
+ """ + # pylint: disable=too-many-locals,too-many-positional-arguments + close_start, close_end = Hyperlink.find_close(text, match_end) + if (close_start, close_end) == (-1, -1): + return _HyperlinkResult(_HyperlinkAction.NO_CLOSE) + inner_text = text[match_end:close_start] + inner_width = width( + inner_text, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + + if inner_width == 0: + return _HyperlinkResult(_HyperlinkAction.EMPTY, close_end=close_end) + + hl_col_end = col + inner_width + + if hl_col_end <= start or col >= end: + return _HyperlinkResult(_HyperlinkAction.OUTSIDE, close_end=close_end, + inner_width=inner_width) + + inner_clip_start = max(0, start - col) + inner_clip_end = end - col + + clipped_inner = clip( + inner_text, inner_clip_start, inner_clip_end, + fillchar=fillchar, tabsize=tabsize, + ambiguous_width=ambiguous_width, + propagate_sgr=False, + control_codes=control_codes, + ) + + clipped_width = width( + clipped_inner, control_codes=control_codes, + tabsize=tabsize, ambiguous_width=ambiguous_width, + ) + + return _HyperlinkResult( + _HyperlinkAction.VISIBLE, + close_end=close_end, + inner_width=inner_width, + open_seq=params.make_open(), + clipped_inner=clipped_inner, + close_seq=params.make_close(), + clipped_width=clipped_width, + hl_col_end=hl_col_end, + ) + + +def _reconstruct_painter( + cells: dict[int, tuple[str, int]], + sequences: list[tuple[int, int, str]], + start: int, + end: int, + fillchar: str, +) -> str: + """ + Reconstruct the output string from painter's algorithm state. + + Walks columns left-to-right, interleaving escape sequences and cell content, filling gaps with + *fillchar*. + """ + # pylint: disable=too-many-locals + # Group and sort sequences by column, preserving insertion order within each. 
def _clip_simple(
    text: str,
    start: int,
    end: int,
    *,
    propagate_sgr: bool,
    ambiguous_width: int,
    fillchar: str,
    tabsize: int,
    strict: bool,
    control_codes: Literal['parse', 'strict', 'ignore'],
) -> tuple[str, Optional[_SGRState]]:
    """
    Clip text without cursor movement (simple append-to-output path).

    Returns ``(result, captured_style)``. The caller applies SGR wrapping.

    The text is scanned left to right while ``col`` tracks the display column. Anything that
    falls inside the window ``start <= col < end`` is appended to the output in scan order.

    :param text: String to clip.
    :param start: First column of the clip window (inclusive).
    :param end: Column at which the clip window stops (exclusive).
    :param propagate_sgr: When true, SGR sequences update the tracked style and are not
        emitted; when false, no style is tracked and they are appended like any other
        recognized sequence.
    :param ambiguous_width: Forwarded to ``width()`` and ``_process_hyperlink()``.
    :param fillchar: Emitted once per visible column of a grapheme that only partly overlaps
        the window.
    :param tabsize: When greater than zero, a tab advances to the next multiple of ``tabsize``
        and spaces are emitted for the columns inside the window. Otherwise the tab is appended
        as-is and the column is not advanced.
    :param strict: When true, a sequence matching ``INDETERMINATE_EFFECT_SEQUENCE`` raises
        :exc:`ValueError`.
    :param control_codes: Forwarded unchanged to ``_process_hyperlink()``.
    :returns: The clipped string and the style snapshot taken at the first visible emission
        (``None`` when nothing visible was emitted or when ``propagate_sgr`` is false).
    :raises ValueError: In strict mode, on an indeterminate-effect sequence.
    """
    # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements
    # pylint: disable=too-many-nested-blocks
    # code length and complexity traded for performance, to allow this to be used as a "hot path"

    output: list[str] = []
    col = 0
    idx = 0
    # captured_style is a frozen snapshot of current_style taken at the first
    # visible character emitted within the clip window (start, end). It stays
    # None until that point. current_style, by contrast, is continuously
    # updated by SGR sequences throughout the scan. The snapshot is what the
    # caller uses to wrap the result in the correct SGR state.
    #
    # When propagate_sgr is False, current_style (and therefore captured_style)
    # remain None, and SGR sequences pass through as literal text.
    captured_style: Optional[_SGRState] = None
    current_style = _SGR_STATE_DEFAULT if propagate_sgr else None

    while idx < len(text):
        char = text[idx]

        # Early exit: past visible region.
        # CR, BS, TAB and ESC are excluded from the shortcut and still go through
        # the branches below even once col has reached end.
        if col >= end and char not in '\r\x08\t\x1b':
            if captured_style is not None:
                break
            # propagate_sgr is always False here: with propagate_sgr=True,
            # captured_style is set on the first visible emission in the
            # clip window and we would have broken above. The skip-ahead
            # optimization is only needed (and safe) when SGR tracking is off.
            next_esc = text.find('\x1b', idx + 1)
            if next_esc == -1:
                break
            idx = next_esc
            continue

        if char == '\x1b':
            m = _SEQUENCE_CLASSIFY.match(text, idx)
            if not m:
                # Unrecognized sequence: emit the lone ESC and resume scanning after it.
                output.append(char)
                idx += 1
                continue

            # SGR: update current_style, do not emit.
            if m.group('sgr_params') is not None and propagate_sgr and current_style is not None:
                current_style = _sgr_state_update(current_style, m.group())
                idx = m.end()
                continue

            # OSC 8 hyperlink.
            if hl_state := HyperlinkParams.parse(m.group()):
                r = _process_hyperlink(
                    text, start, end, fillchar, tabsize, ambiguous_width,
                    control_codes,
                    params=hl_state, match_end=m.end(), col=col,
                )
                if r.action is _HyperlinkAction.NO_CLOSE:
                    # Emit only the opening sequence and continue right after it.
                    output.append(m.group())
                    idx = m.end()
                elif r.action is _HyperlinkAction.EMPTY:
                    # Nothing is emitted; resume after the closing sequence.
                    idx = r.close_end
                elif r.action is _HyperlinkAction.OUTSIDE:
                    # Nothing is emitted, but the inner width still advances the column.
                    col += r.inner_width
                    idx = r.close_end
                else:
                    # Emit the hyperlink rebuilt around its clipped inner text.
                    output.append(r.open_seq)
                    output.append(r.clipped_inner)
                    output.append(r.close_seq)
                    if propagate_sgr and captured_style is None:
                        captured_style = current_style
                    col += r.inner_width
                    idx = r.close_end
                continue

            # Indeterminate-effect sequences: raise in strict mode.
            seq = m.group()
            if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq):
                raise ValueError(
                    f"Indeterminate cursor sequence at position {idx}, "
                    f"{seq!r}"
                )

            # Any other recognized sequence: preserve as-is.
            output.append(seq)
            idx = m.end()
            continue

        if char == '\t':
            # Expand tab, filling clip window with spaces.
            if tabsize > 0:
                next_tab = col + (tabsize - (col % tabsize))
                while col < next_tab:
                    if start <= col < end:
                        output.append(' ')
                        if propagate_sgr and captured_style is None:
                            captured_style = current_style
                    col += 1
            else:
                # Tab expansion disabled: the tab is kept and does not advance col.
                output.append('\t')
            idx += 1
            continue

        grapheme = next(iter_graphemes(text, start=idx))
        grapheme_w = width(grapheme, ambiguous_width=ambiguous_width)

        # Emit grapheme or fillchar depending on visibility within clip window.
        if grapheme_w == 0:
            # Zero-width graphemes are kept when the cursor is inside the window;
            # they do not trigger the style snapshot.
            if start <= col < end:
                output.append(grapheme)
        elif col >= start and col + grapheme_w <= end:
            # Entirely inside the window.
            output.append(grapheme)
            if propagate_sgr and captured_style is None:
                captured_style = current_style
        elif col < end and col + grapheme_w > start:
            # Straddles a window edge: one fillchar per column that is inside the window.
            output.append(fillchar * (min(end, col + grapheme_w) - max(start, col)))
            if propagate_sgr and captured_style is None:
                captured_style = current_style

        col += grapheme_w
        idx += len(grapheme)

    return ''.join(output), captured_style
+ captured_style: Optional[_SGRState] = None + current_style = _SGR_STATE_DEFAULT if propagate_sgr else None + + def _write_cells(s: str, w: int, write_col: int, + is_hyperlink: bool = False) -> None: + """Write *w* cells of text *s* at *write_col*, handling wide-char splitting.""" + nonlocal captured_style + for offset in range(w): + src_col = write_col + offset + if src_col > 0 and cells.get(src_col - 1, ('', 0))[1] == 2: + cells[src_col - 1] = (fillchar, 1) + hyperlink_cells.discard(src_col - 1) + if cells.get(src_col, ('', 0))[1] == 2: + cells[src_col + 1] = (fillchar, 1) + hyperlink_cells.discard(src_col + 1) + cells.pop(src_col, None) + hyperlink_cells.discard(src_col) + cells[write_col] = (s, w) + if is_hyperlink: + for offset in range(w): + hyperlink_cells.add(write_col + offset) + if propagate_sgr and captured_style is None: + captured_style = current_style + + while idx < len(text): + char = text[idx] + + # Early exit: past visible region, SGR captured, no escape ahead. + if col >= end and captured_style is not None and char != '\x1b': + break + + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + # Record lone ESC as a zero-width sequence at current column. + sequences.append((col, seq_order, char)) + seq_order += 1 + if propagate_sgr and captured_style is None: + captured_style = current_style + idx += 1 + continue + + # SGR: update current_style, do not emit. + if m.group('sgr_params') is not None and propagate_sgr and current_style is not None: + current_style = _sgr_state_update(current_style, m.group()) + idx = m.end() + continue + + # OSC 8 hyperlink. 
+ if hl_state := HyperlinkParams.parse(m.group()): + r = _process_hyperlink( + text, start, end, fillchar, tabsize, ambiguous_width, + control_codes, + params=hl_state, match_end=m.end(), col=col, + ) + if r.action is _HyperlinkAction.NO_CLOSE: + sequences.append((col, seq_order, m.group())) + seq_order += 1 + if propagate_sgr and captured_style is None: + captured_style = current_style + idx = m.end() + elif r.action is _HyperlinkAction.EMPTY: + idx = r.close_end + elif r.action is _HyperlinkAction.OUTSIDE: + col += r.inner_width + idx = r.close_end + else: + sequences.append((col, seq_order, r.open_seq)) + seq_order += 1 + if propagate_sgr and captured_style is None: + captured_style = current_style + _write_cells(r.clipped_inner, r.clipped_width, col, + is_hyperlink=True) + col += r.clipped_width + sequences.append((col, seq_order, r.close_seq)) + seq_order += 1 + col = r.hl_col_end + idx = r.close_end + continue + + # Indeterminate-effect sequences: raise in strict mode. + seq = m.group() + if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + raise ValueError( + f"Indeterminate cursor sequence at position {idx}, " + f"{seq!r}" + ) + + # Horizontal Position Absolute (CSI n G). + if (hpa_n := m.group('hpa_n')) is not None: + col = int(hpa_n) - 1 if hpa_n else 0 + idx = m.end() + continue + + # Cursor Forward (CSI n C). + if (cforward_n := m.group('cforward_n')) is not None: + n_forward = int(cforward_n) if cforward_n else 1 + move_end = col + n_forward + if col < end and move_end > start: + for i in range(max(col, start), min(move_end, end)): + _write_cells(fillchar, 1, i) + col = move_end + idx = m.end() + continue + + # Cursor Backward (CSI n D). 
+ if (cbackward_n := m.group('cbackward_n')) is not None: + n_backward = int(cbackward_n) if cbackward_n else 1 + if strict and n_backward > col: + raise ValueError( + f"Cursor left movement at position {idx} would move " + f"{n_backward} cells left from column {col}, " + f"exceeding string start" + ) + col = max(0, col - n_backward) + idx = m.end() + continue + + # Any other recognized sequence: preserve as-is. + sequences.append((col, seq_order, m.group())) + seq_order += 1 + if propagate_sgr and captured_style is None: + captured_style = current_style + idx = m.end() + continue + + # Carriage return. + if char == '\r': + col = 0 + idx += 1 + continue + + # Backspace. + if char == '\x08': + if col > 0: + col -= 1 + idx += 1 + continue + + # Tab expansion. + if char == '\t': + if tabsize > 0: + next_tab = col + (tabsize - (col % tabsize)) + while col < next_tab: + if start <= col < end: + _write_cells(fillchar, 1, col) + col += 1 + else: + sequences.append((col, seq_order, '\t')) + seq_order += 1 + if propagate_sgr and captured_style is None: + captured_style = current_style + idx += 1 + continue + + # Grapheme cluster. + grapheme = next(iter_graphemes(text, start=idx)) + grapheme_w = width(grapheme, ambiguous_width=ambiguous_width) + + # Emit grapheme or fillchar depending on visibility within clip window. 
def clip(
    text: str,
    start: int,
    end: int,
    *,
    fillchar: str = ' ',
    tabsize: int = 8,
    ambiguous_width: int = 1,
    propagate_sgr: bool = True,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    overtyping: Optional[bool] = None,
) -> str:
    r"""
    Return the part of *text* shown in display columns ``(start, end)``.

    Selection is by visible column, not by character index, and *text* is assumed to begin at
    column 0. Terminal escape sequences have zero display width and are kept in the result. A
    wide (width 2) character cut by either boundary is replaced with ``fillchar``.

    TAB (``\t``) is expanded to spaces up to the next tab stop, as set by ``tabsize``. When
    cursor movement is present, a "painter's algorithm" is used instead of simple appending:
    cursor movements change the write position, so cursor-left and carriage return can
    overwrite cells written earlier.

    **OSC 8 hyperlinks** get special treatment: the visible text inside the link is clipped to
    the requested columns and the hyperlink is rebuilt around what remains. A hyperlink left
    with no visible text is dropped::

        >>> clip('\x1b]8;;http://example.com\x07Click This link\x1b]8;;\x07', 6, 10)
        '\x1b]8;;http://example.com\x07This\x1b]8;;\x07'

    :param text: String to clip, may contain terminal escape sequences.
    :param start: Absolute starting column (inclusive, 0-indexed). Negative values are
        treated as 0.
    :param end: Absolute ending column (exclusive). An empty string is returned when
        ``end <= start``.
    :param fillchar: Character substituted for a wide character split at a boundary
        (default space). Must have display width of 1.
    :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through as zero-width
        (kept in the output without advancing the column).
    :param ambiguous_width: Width to use for East Asian Ambiguous (A) characters. Default is
        ``1`` (narrow). Set to ``2`` for CJK contexts.
    :param propagate_sgr: If True (default), SGR (terminal styling) sequences are propagated:
        the result begins with any style active at the start position and ends with a reset
        sequence if styles are active.
    :param control_codes: How to handle control characters and sequences:

        - ``'parse'`` (default): Track horizontal cursor movement and clip hyperlink text.
          Cursor overwrite is always allowed, with best effort results; indeterminate
          sequences (home, clear, reset, etc.) are preserved as zero-width.
        - ``'strict'``: Like ``parse``, but raises :exc:`ValueError` on sequences with
          indeterminate effects (cursor home, clear screen, reset, vertical movement, etc.)
          matching :func:`width` behavior. Also raises on out-of-bounds horizontal cursor
          movement.
        - ``'ignore'``: All control characters are treated as zero-width. Cursor movement is
          not tracked (fastest path).

    :param overtyping: Whether to use the painter's algorithm for cursor movement (``\b``
        backspace, ``\r`` carriage return, and CSI cursor left/right/position sequences).
        ``None`` (default) auto-detects by scanning *text* for them. ``False`` skips the scan
        when the caller knows *text* has no cursor movement. ``True`` forces the painter's
        algorithm (useful for testing). Has no effect when ``control_codes='ignore'``.

    :returns: Substring of ``text`` spanning display columns ``(start, end)``, with all
        terminal sequences preserved and wide characters at boundaries replaced with
        ``fillchar``.

    :raises ValueError: If ``control_codes='strict'`` and an indeterminate-effect sequence or
        out-of-bounds cursor movement is encountered.

    SGR (terminal styling) sequences are propagated by default, so the result begins with any
    active style and ends with a reset::

        >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11)
        '\x1b[1;34mworld\x1b[0m'

    Set ``propagate_sgr=False`` to disable this behavior.

    .. versionadded:: 0.3.0

    .. versionchanged:: 0.5.0
       Added ``propagate_sgr`` parameter (default True).

    .. versionchanged:: 0.7.0
       Added ``control_codes`` parameter (default 'parse').
       OSC 8 hyperlink-aware clipping.
       Added ``overtyping`` parameter (default None, auto-detect).

    Example::

        >>> clip('hello world', 0, 5)
        'hello'
        >>> clip('中文字', 0, 3)  # Wide char split at column 3
        '中 '
        >>> clip('a\tb', 0, 10)  # Tab expanded to spaces
        'a       b'
    """
    if start < 0:
        start = 0
    if start >= end:
        return ''

    # Printable ASCII has one column per character: plain slicing is exact.
    if text.isascii() and text.isprintable():
        return text[start:end]

    # Without any ESC there can be no SGR sequence, so style tracking is pointless.
    contains_escape = '\x1b' in text
    if not contains_escape and propagate_sgr:
        propagate_sgr = False

    # Choose between the painter's algorithm and the simple append path.
    if control_codes == 'ignore':
        # 'ignore' never tracks cursor movement, whatever the caller asked for.
        use_painter = False
    elif overtyping is None:
        # Auto-detect: backspace, carriage return, or a horizontal cursor sequence.
        use_painter = (
            '\x08' in text
            or '\r' in text
            or (contains_escape and bool(_HORIZONTAL_CURSOR_MOVEMENT.search(text)))
        )
    else:
        use_painter = overtyping

    clipper = _clip_painter if use_painter else _clip_simple
    clipped, captured_style = clipper(
        text=text,
        start=start,
        end=end,
        propagate_sgr=propagate_sgr,
        ambiguous_width=ambiguous_width,
        fillchar=fillchar,
        tabsize=tabsize,
        strict=(control_codes == 'strict'),
        control_codes=control_codes,
    )
    return _apply_sgr_wrap(clipped, captured_style)
def wcswidth(
    pwcs: str,
    n: Optional[int] = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    See :ref:`Specification` for details of cell measurement.

    This implementation differs from Markus Kuhn's original POSIX C implementation, in that this
    ``wcswidth()`` processes grapheme strings yielded by :func:`wcwidth.iter_graphemes` defined by
    `Unicode Standard Annex #29`_. POSIX wcswidth(3) is not grapheme-aware and does not measure many
    kinds of Emojis or complex scripts correctly.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured. A value
        larger than ``len(pwcs)`` measures the entire string.

    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
           Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
        characters!

    .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements
    # pylint: disable=too-complex,too-many-branches,duplicate-code
    # This function intentionally keeps all logic inline for performance.

    # Clamp ``n`` to the string length. Without the clamp, n > len(pwcs) walks
    # the index past the end of the string and raises IndexError.
    length = len(pwcs)
    end = length if n is None else min(n, length)

    # Fast path: pure ASCII printable strings are always width == length.
    # Only taken when the whole string is being measured.
    if end == length and pwcs.isascii() and pwcs.isprintable():
        return length

    # Select wcwidth call pattern for best lru_cache performance
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    total_width = 0
    idx = 0

    # grapheme-clustering state
    last_measured_idx = -2
    last_measured_ucs = -1
    last_was_virama = False
    conjunct_pending = False

    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)

        # ZWJ (U+200D)
        if ucs == 0x200D:
            if last_was_virama:
                # After a virama only the ZWJ itself is consumed; virama state is kept.
                idx += 1
            elif idx + 1 < end:
                # Otherwise the ZWJ and the character joined to it are both skipped.
                last_was_virama = False
                idx += 2
            else:
                last_was_virama = False
                idx += 1
            continue

        # VS16 (U+FE0F): converts preceding narrow character to wide.
        if ucs == 0xFE0F and last_measured_idx >= 0:
            total_width += bisearch(
                ord(pwcs[last_measured_idx]),
                VS16_NARROW_TO_WIDE['9.0.0'],
            )
            last_measured_idx = -2  # prevent double application
            idx += 1
            continue

        # Regional Indicator & Fitzpatrick (both above BMP)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Count the regional indicators immediately before this one. An odd
                # count means this one completes a pair and adds no width.
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                # Skin-tone modifier following an emoji base adds no width.
                idx += 1
                continue

        # Virama conjunct formation
        if last_was_virama and bisearch(ucs, ISC_CONSONANT):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue

        # Normal character: measure with wcwidth
        w = _wcwidth(char)
        if w < 0:
            # C0/C1 control character
            return -1
        if w > 0:
            if conjunct_pending:
                total_width += 1
                conjunct_pending = False
            total_width += w
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            total_width += 1
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    if conjunct_pending:
        total_width += 1
    return total_width
+ +http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html +http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html + +In fixed-width output devices, Latin characters all occupy a single +"cell" position of equal width, whereas ideographic CJK characters +occupy two such cells. Interoperability between terminal-line +applications and (teletype-style) character terminals using the +UTF-8 encoding requires agreement on which character should advance +the cursor by how many cell positions. No established formal +standards exist at present on which Unicode character shall occupy +how many cell positions on character terminals. These routines are +a first attempt of defining such behavior based on simple rules +applied to data provided by the Unicode Consortium. + +For some graphical characters, the Unicode standard explicitly +defines a character-cell width via the definition of the East Asian +FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. +In all these cases, there is no ambiguity about which width a +terminal shall use. For characters in the East Asian Ambiguous (A) +class, the width choice depends purely on a preference of backward +compatibility with either historic CJK or Western practice. +Choosing single-width for these characters is easy to justify as +the appropriate long-term solution, as the CJK practice of +displaying these characters as double-width comes from historic +implementation simplicity (8-bit encoded characters were displayed +single-width and 16-bit ones double-width, even for Greek, +Cyrillic, etc.) and not any typographic considerations. + +Much less clear is the choice of width for the Not East Asian +(Neutral) class. Existing practice does not dictate a width for any +of these characters. 
It would nevertheless make sense +typographically to allocate two character cells to characters such +as for instance EM SPACE or VOLUME INTEGRAL, which cannot be +represented adequately with a single-width glyph. The following +routines at present merely assign a single-cell width to all +neutral characters, in the interest of simplicity. This is not +entirely satisfactory and should be reconsidered before +establishing a formal standard in this area. At the moment, the +decision which Not East Asian (Neutral) characters should be +represented by double-width glyphs cannot yet be answered by +applying a simple rule from the Unicode database content. Setting +up a proper standard for the behavior of UTF-8 character terminals +will require a careful analysis not only of each Unicode character, +but also of each presentation form, something the author of these +routines has avoided to do so far. + +http://www.unicode.org/unicode/reports/tr11/ + +Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +""" + +from __future__ import annotations + +# std imports +from functools import lru_cache + +# local +from .bisearch import bisearch +from ._constants import _LATEST_VERSION, _AMBIGUOUS_TABLE, _ZERO_WIDTH_TABLE, _WIDE_EASTASIAN_TABLE + + +@lru_cache(maxsize=128) +def _wcversion_value(ver_string: str) -> tuple[int, ...]: # pragma: no cover + """ + Integer-mapped value of given dotted version string. + + .. deprecated:: 0.3.0 + + This function is no longer used internally by wcwidth but is retained + for API compatibility with external tools. + + :param ver_string: Unicode version string, of form ``n.n.n``. + :returns: tuple of digit tuples, ``tuple(int, [...])``. + """ + retval = tuple(map(int, (ver_string.split('.')))) + return retval + + +@lru_cache(maxsize=8) +def _wcmatch_version(given_version: str) -> str: # pylint: disable=unused-argument + """ + Return the supported Unicode version level. + + .. 
# maxsize=1024: western scripts need ~64 unique codepoints per session, but
# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates
# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss.

@lru_cache(maxsize=1024)
def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int:  # pylint: disable=unused-argument
    r"""
    Return the number of terminal cells needed to print one Unicode codepoint.

    :param wc: A single Unicode character. An empty string is measured as NUL.
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
           Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts
        where ambiguous characters display as double-width. See
        :ref:`ambiguous_width` for details.
    :returns: ``-1`` for C0 and C1 control characters and DEL, which are not
        printable or have an indeterminate effect on the terminal. ``0`` when
        ``wc`` has no printable effect on a terminal (such as NUL '\0').
        Otherwise the number of column positions the character occupies on a
        graphic terminal (1 or 2).

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Printable ASCII is answered before any table lookup. The original author
    # notes this early return as a large speedup for mostly-ASCII documents.
    if 0x20 <= codepoint <= 0x7e:
        return 1

    # C0 (excluding NUL), DEL and C1 are -1 for compatibility with POSIX-like calls.
    if 0 < codepoint < 0x20 or 0x7f <= codepoint <= 0x9f:
        return -1

    # Zero-width takes precedence over the wide tables.
    if bisearch(codepoint, _ZERO_WIDTH_TABLE):
        return 0

    # Wide (F/W categories) always; Ambiguous (A category) only when ambiguous_width=2.
    double_width = bisearch(codepoint, _WIDE_EASTASIAN_TABLE) or (
        ambiguous_width == 2 and bisearch(codepoint, _AMBIGUOUS_TABLE)
    )
    return 2 if double_width else 1
def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
    """
    Measure *text* for ``width()`` when ``control_codes='ignore'``.

    Escape sequences are removed with ``strip_sequences()``, the characters listed in
    ``_CONTROL_CHAR_TABLE`` are deleted, and the remainder is measured by ``wcswidth()``.
    """
    without_sequences = strip_sequences(text)
    without_controls = without_sequences.translate(_CONTROL_CHAR_TABLE)
    return wcswidth(without_controls, ambiguous_width=ambiguous_width)
+ + :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. + Must be positive. Has no effect when ``control_codes='ignore'``. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences + present in ``text`` according to given parameters. This represents the rightmost column the + cursor reaches. Always a non-negative integer. + + :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate + effects, such as vertical movement or clear sequences are encountered, or on unexpected + C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values. + + .. versionadded:: 0.3.0 + + .. versionchanged:: 0.7.0 + Expanded strict-mode to raise :exc:`ValueError` when cursor-left movement + (CSI D) would move beyond the beginning of the string. Previously, cursor-left + was silently clamped to column 0 in all modes. + + Examples:: + + >>> width('hello') + 5 + >>> width('コンニチハ') + 10 + >>> width('\x1b[31mred\x1b[0m') + 3 + >>> width('\x1b[31mred\x1b[0m', control_codes='ignore') # same result (ignored) + 3 + >>> width('123\b4') # backspace overwrites previous cell (outputs '124') + 3 + >>> width('abc\t') # tab caused cursor to move to column 8 + 8 + >>> width('1\x1b[10C') # '1' + cursor right 10, cursor ends on column 11 + 11 + >>> width('1\x1b[10C', control_codes='ignore') # faster but wrong in this case + 1 + """ + # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals + # This could be broken into sub-functions (#1, #3, and #6 especially), but for reduced overhead + # in consideration of this function a likely "hot path", they are inline, breaking many pylint + # complexity rules. 
+ + # Fast path for ASCII printable (no tabs, escapes, or control chars) + if text.isascii() and text.isprintable(): + return len(text) + + # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. + # Only check longer strings - the detection overhead hurts short string performance. + if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: + # Check for cursor-affecting control characters + if '\b' not in text and '\t' not in text and '\r' not in text: + # Check for escape sequences - if none contain cursor movement + if '\x1b' not in text or not CURSOR_MOVEMENT_SEQUENCE.search(text): + control_codes = 'ignore' + + # Fast path for ignore mode, useful if you know the text is already free of control codes + if control_codes == 'ignore': + return _width_ignored_codes(text, ambiguous_width) + + strict = control_codes == 'strict' + # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0. + # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width. + current_col = 0 + max_extent = 0 + idx = 0 + text_len = len(text) + + # Select wcwidth call pattern for best lru_cache performance: + # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls + # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) + _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) + + # grapheme-clustering state + last_measured_idx = -2 + last_measured_ucs = -1 + last_was_virama = False + conjunct_pending = False + + while idx < text_len: + char = text[idx] + + # 1. ESC sequences + if char == '\x1b': + m = _SEQUENCE_CLASSIFY.match(text, idx) + if not m: + # 1a. 
Errant ESC or unknown sequence: only the first character is zero-width + idx += 1 + else: + seq = m.group() + if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): + raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}") + + # 2b. horizontal position absolute (before forward/backward to + # avoid other_seq match in _SEQUENCE_CLASSIFY) + if (hpa_n := m.group('hpa_n')) is not None: + target_col = int(hpa_n) if hpa_n else 1 + if strict: + raise ValueError( + f"Indeterminate horizontal position at position {idx}, " + f"{seq!r} (absolute column unknown)" + ) + current_col = target_col - 1 # HPA is 1-indexed, convert to 0-indexed + # 2c. cursor forward, backward + elif (cforward_n := m.group('cforward_n')) is not None: + current_col += int(cforward_n) if cforward_n else 1 + elif (cbackward_n := m.group('cbackward_n')) is not None: + n_backward = int(cbackward_n) if cbackward_n else 1 + if strict and n_backward > current_col: + raise ValueError( + f"Cursor left movement at position {idx} would move " + f"{n_backward} cells left from column {current_col}, " + f"exceeding string start" + ) + current_col = max(0, current_col - n_backward) + # 2d. SGR and other zero-width sequences -- no column advance + idx = m.end() + # Escape sequences break VS16 adjacency: reset last-measured state + last_measured_idx = -2 + last_measured_ucs = -1 + max_extent = max(max_extent, current_col) + continue + + # 2. Vertical or Illegal control characters zero width or error when 'strict' + if char in ILLEGAL_CTRL: + if strict: + raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") + idx += 1 + last_measured_idx = -2 + last_measured_ucs = -1 + continue + + if char in VERTICAL_CTRL: + if strict: + raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") + idx += 1 + last_measured_idx = -2 + last_measured_ucs = -1 + continue + + # 3. 
Horizontal movement characters + if char in HORIZONTAL_CTRL: + if char == '\x09' and tabsize > 0: # Tab + current_col += tabsize - (current_col % tabsize) + elif char == '\x08': # Backspace + if current_col > 0: + current_col -= 1 + elif char == '\x0d': # Carriage return + if strict: + raise ValueError( + f"Horizontal movement character \\r at position {idx}: " + "indeterminate starting column" + ) + current_col = 0 + max_extent = max(max_extent, current_col) + idx += 1 + last_measured_idx = -2 + last_measured_ucs = -1 + continue + + # 4. Zero-width control characters + if char in ZERO_WIDTH_CTRL: + idx += 1 + last_measured_idx = -2 + last_measured_ucs = -1 + continue + + # 5. Inline grapheme-clustering: ZWJ, VS16, Regional Indicators, + # Fitzpatrick, Virama conjuncts, Mc, wcwidth + ucs = ord(char) + + # ZWJ (U+200D) + if ucs == 0x200D: + if last_was_virama: + idx += 1 + elif idx + 1 < text_len: + last_was_virama = False + idx += 2 + else: + last_was_virama = False + idx += 1 + continue + + # VS16 (U+FE0F): converts preceding narrow character to wide. 
+ if ucs == 0xFE0F and last_measured_idx >= 0: + if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE['9.0.0']): + current_col += 1 + max_extent = max(max_extent, current_col) + last_measured_idx = -2 # prevent double application + idx += 1 + continue + + # Regional Indicator & Fitzpatrick (both above BMP) + if ucs > 0xFFFF: + if ucs in _REGIONAL_INDICATOR_SET: + ri_before = 0 + j = idx - 1 + while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: + ri_before += 1 + j -= 1 + if ri_before % 2 == 1: + last_measured_ucs = ucs + idx += 1 + continue + elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] + and last_measured_ucs in _EMOJI_ZWJ_SET): + idx += 1 + continue + + # Virama conjunct formation + if last_was_virama and bisearch(ucs, ISC_CONSONANT): + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + conjunct_pending = True + idx += 1 + continue + + # Normal character: measure with wcwidth + w = _wcwidth(char) + if w > 0: + if conjunct_pending: + current_col += 1 + conjunct_pending = False + current_col += w + max_extent = max(max_extent, current_col) + last_measured_idx = idx + last_measured_ucs = ucs + last_was_virama = False + elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): + # Spacing Combining Mark (Mc) following a base character adds 1 + current_col += 1 + max_extent = max(max_extent, current_col) + last_measured_idx = -2 + last_was_virama = False + conjunct_pending = False + else: + last_was_virama = ucs in _ISC_VIRAMA_SET + idx += 1 + + if conjunct_pending: + current_col += 1 + max_extent = max(max_extent, current_col) + return max_extent diff --git a/wcwidth/align.py b/wcwidth/align.py new file mode 100644 index 00000000..328454bb --- /dev/null +++ b/wcwidth/align.py @@ -0,0 +1,136 @@ +"""Python grapheme, emoji, and sequence-aware ljust, rjust, center().""" +from typing import Literal + +# local +from ._width import width + + +def ljust( + text: str, + dest_width: int, + fillchar: str = ' ', + 
*, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text left-justified in a string of given display width. + + :param text: String to justify, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. + :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on the right to reach ``dest_width``. + + .. versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.ljust('hi', 5) + 'hi ' + >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5) + '\x1b[31mhi\x1b[0m ' + >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + '👨‍👩‍👧 ' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + padding_cells = max(0, dest_width - text_width) + return text + fillchar * padding_cells + + +def rjust( + text: str, + dest_width: int, + fillchar: str = ' ', + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text right-justified in a string of given display width. + + :param text: String to justify, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. 
+ :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on the left to reach ``dest_width``. + + .. versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.rjust('hi', 5) + ' hi' + >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5) + ' \x1b[31mhi\x1b[0m' + >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + ' 👨‍👩‍👧' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + padding_cells = max(0, dest_width - text_width) + return fillchar * padding_cells + text + + +def center( + text: str, + dest_width: int, + fillchar: str = ' ', + *, + control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', + ambiguous_width: int = 1, +) -> str: + r""" + Return text centered in a string of given display width. + + :param text: String to center, may contain terminal sequences. + :param dest_width: Total display width of result in terminal cells. + :param fillchar: Single character for padding (default space). Must have + display width of 1 (not wide, not zero-width, not combining). Unicode + characters like ``'·'`` are acceptable. The width is not validated. + :param control_codes: How to handle control sequences when measuring. + Passed to :func:`width` for measurement. + :param ambiguous_width: Width to use for East Asian Ambiguous (A) + characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. + :returns: Text padded on both sides to reach ``dest_width``. + + For odd-width padding, the extra cell fills in the same cell position as + Python's :meth:`str.center` behavior (the left side when ``dest_width`` is + odd, the right side when ``dest_width`` is even). + See `the eccentric str.center `_. + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> wcwidth.center('hi', 6) + ' hi ' + >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6) + ' \x1b[31mhi\x1b[0m ' + >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) + ' 👨‍👩‍👧 ' + """ + if text.isascii() and text.isprintable(): + text_width = len(text) + else: + text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) + total_padding = max(0, dest_width - text_width) + # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html + left_pad = total_padding // 2 + (total_padding & dest_width & 1) + right_pad = total_padding - left_pad + return fillchar * left_pad + text + fillchar * right_pad diff --git a/wcwidth/bisearch.py b/wcwidth/bisearch.py index becfe86a..e95c51b8 100644 --- a/wcwidth/bisearch.py +++ b/wcwidth/bisearch.py @@ -1,4 +1,5 @@ """Binary search function for Unicode interval tables.""" + from __future__ import annotations @@ -7,8 +8,8 @@ def bisearch(ucs: int, table: tuple[tuple[int, int], ...]) -> int: Binary search in interval table. :param ucs: Ordinal value of unicode character. - :param table: Tuple of starting and ending ranges of ordinal values, - in form of ``((start, end), ...)``. + :param table: Tuple of starting and ending ranges of ordinal values, in form of ``((start, end), + ...)``. :returns: 1 if ordinal value ucs is found within lookup table, else 0. """ lbound = 0 diff --git a/wcwidth/escape_sequences.py b/wcwidth/escape_sequences.py index 9e37e5ac..9c296de3 100644 --- a/wcwidth/escape_sequences.py +++ b/wcwidth/escape_sequences.py @@ -5,9 +5,15 @@ sequences that begin with ESC (``\x1b``). Before calling re.match with these patterns, callers should first check that the character at the current position is ESC for optimal performance. """ + # std imports import re +import typing + +# local +from .sgr_state import _SGR_PATTERN + # Zero-width escape sequences (SGR, OSC, CSI, etc.). 
This table, like INDETERMINATE_EFFECT_SEQUENCE, # originated from the 'blessed' library. ZERO_WIDTH_PATTERN = re.compile( @@ -39,6 +45,30 @@ # Cursor left movement: CSI [n] D, parameter may be parsed by width() CURSOR_LEFT_SEQUENCE = re.compile(r'\x1b\[(\d*)D') +# Horizontal position absolute: CSI [n] G, parameter may be parsed by width() +CURSOR_HPA_SEQUENCE = re.compile(r'\x1b\[(\d*)G') + +# Combined cursor movement: single regex for fast-path detection of any +# horizontal cursor movement (left, right, hpa). Avoids two separate search() +# calls in hot-path width() and clip() pre-checks. +CURSOR_MOVEMENT_SEQUENCE = re.compile(r'\x1b\[(\d*)[CDG]') + +# Combined horizontal cursor movement: matches BS, CR, and CSI C/D/G cursor sequences +# in a single regex pass. Used by clip() to decide between the simple append path +# and the painter's algorithm. +_HORIZONTAL_CURSOR_MOVEMENT = re.compile(r'[\x08\r]|\x1b\[(\d*)[CDG]') + +# Combined pattern: a single regex that matches any zero-width escape sequence +# and classifies it via named groups, aprox 2x faster than redundant re.matches +# in clip() and width(). +_SEQUENCE_CLASSIFY = re.compile( + _SGR_PATTERN.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_HPA_SEQUENCE.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_RIGHT_SEQUENCE.pattern.replace('(', '(?P', 1) + + '|' + CURSOR_LEFT_SEQUENCE.pattern.replace('(', '(?P', 1) + + '|' + r'(?P(?:' + ZERO_WIDTH_PATTERN.pattern + '))' +) + # Indeterminate effect sequences - raise ValueError in 'strict' mode. The effects of these sequences # are likely to be undesirable, moving the cursor vertically or to any unknown position, and # otherwise not managed by the 'width' method of this library. 
@@ -51,7 +81,6 @@ r'\x1b\[\d+;\d+r', # change_scroll_region r'\x1b\[\d*K', # erase_in_line (clr_eol, clr_bol) r'\x1b\[\d*J', # erase_in_display (clr_eos, erase_display) - r'\x1b\[\d*G', # column_address r'\x1b\[\d+;\d+H', # cursor_address r'\x1b\[\d*H', # cursor_home r'\x1b\[\d*A', # cursor_up @@ -72,3 +101,80 @@ r'\x1bc', # full_reset (RIS) )) ) + + +def iter_sequences(text: str) -> typing.Iterator[typing.Tuple[str, bool]]: + r""" + Iterate through text, yielding segments with sequence identification. + + This generator yields tuples of ``(segment, is_sequence)`` for each part + of the input text, where ``is_sequence`` is ``True`` if the segment is + a recognized terminal escape sequence. + + :param text: String to iterate through. + :returns: Iterator of (segment, is_sequence) tuples. + + .. versionadded:: 0.3.0 + + Example:: + + >>> list(iter_sequences('hello')) + [('hello', False)] + >>> list(iter_sequences('\x1b[31mred')) + [('\x1b[31m', True), ('red', False)] + >>> list(iter_sequences('\x1b[1m\x1b[31m')) + [('\x1b[1m', True), ('\x1b[31m', True)] + """ + idx = 0 + text_len = len(text) + segment_start = 0 + + while idx < text_len: + char = text[idx] + + if char == '\x1b': + # Yield any accumulated non-sequence text + if idx > segment_start: + yield (text[segment_start:idx], False) + + # Try to match an escape sequence + match = ZERO_WIDTH_PATTERN.match(text, idx) + if match: + yield (match.group(), True) + idx = match.end() + else: + # Lone ESC or unrecognized - yield as sequence anyway + yield (char, True) + idx += 1 + segment_start = idx + else: + idx += 1 + + # Yield any remaining text + if segment_start < text_len: + yield (text[segment_start:], False) + + +def strip_sequences(text: str) -> str: + r""" + Return text with all terminal escape sequences removed. + + Unknown or incomplete ESC sequences are preserved. + + :param text: String that may contain terminal escape sequences. + :returns: The input text with all escape sequences stripped. + + .. 
versionadded:: 0.3.0 + + Example:: + + >>> strip_sequences('\x1b[31mred\x1b[0m') + 'red' + >>> strip_sequences('hello') + 'hello' + >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') + 'bold red text' + >>> strip_sequences('\x1b]8;id=34;https://example.com\x1b\\[view]\x1b]8;;\x1b\\') + '[view]' + """ + return ZERO_WIDTH_PATTERN.sub('', text) diff --git a/wcwidth/grapheme.py b/wcwidth/grapheme.py index 7befc920..cdfde222 100644 --- a/wcwidth/grapheme.py +++ b/wcwidth/grapheme.py @@ -13,7 +13,7 @@ from enum import IntEnum from functools import lru_cache -from typing import TYPE_CHECKING, NamedTuple +from typing import TYPE_CHECKING, Optional, NamedTuple # local from .bisearch import bisearch as _bisearch @@ -130,7 +130,7 @@ class BreakResult(NamedTuple): @lru_cache(maxsize=1024) -def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None: +def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> Optional[BreakResult]: """ Check simple GCB-pair-based break rules (cacheable). @@ -248,7 +248,7 @@ def _should_break( def iter_graphemes( unistr: str, start: int = 0, - end: int | None = None, + end: Optional[int] = None, ) -> Iterator[str]: r""" Iterate over grapheme clusters in a Unicode string. @@ -390,7 +390,7 @@ def grapheme_boundary_before(unistr: str, pos: int) -> int: def iter_graphemes_reverse( unistr: str, start: int = 0, - end: int | None = None, + end: Optional[int] = None, ) -> Iterator[str]: r""" Iterate over grapheme clusters in reverse order (last to first). diff --git a/wcwidth/hyperlink.py b/wcwidth/hyperlink.py new file mode 100644 index 00000000..da7a3aa0 --- /dev/null +++ b/wcwidth/hyperlink.py @@ -0,0 +1,142 @@ +""" +OSC 8 hyperlink parsing and measurement. + +.. 
versionadded:: 0.7.0 +""" + +from __future__ import annotations + +# std imports +import re + +import typing + +# local +from ._width import width as _width +from .escape_sequences import _SEQUENCE_CLASSIFY + +HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') +HYPERLINK_CLOSE_RE = re.compile(r'\x1b]8;;(\x07|\x1b\\)') + + +class HyperlinkParams(typing.NamedTuple): + r""" + Parsed parameters from an OSC 8 hyperlink open sequence. + + :param url: The hyperlink URL. + :param params: Colon-separated metadata string (often empty). + :param terminator: Sequence terminator (``\x07`` or ``\x1b\\``). + """ + + url: str + params: str = '' + terminator: str = '\x07' + + @classmethod + def parse(cls, seq: str) -> HyperlinkParams | None: + r""" + Parse an OSC 8 open sequence string. + + Returns ``None`` if *seq* is not a valid OSC 8 open. + + Example:: + + >>> HyperlinkParams.parse('\x1b]8;;http://example.com\x07') + HyperlinkParams(url='http://example.com', params='', terminator='\\x07') + """ + m = HYPERLINK_OPEN_RE.match(seq) + if m is None: + return None + return cls(url=m.group(2), params=m.group(1), terminator=m.group(3)) + + def make_open(self) -> str: + """Generate the OSC 8 open escape sequence.""" + return f'\x1b]8;{self.params};{self.url}{self.terminator}' + + def make_close(self) -> str: + """Generate the OSC 8 close escape sequence.""" + return f'\x1b]8;;{self.terminator}' + + +class Hyperlink(typing.NamedTuple): + """ + A complete OSC 8 hyperlink with target and inner text. + + :param params: Parsed open sequence parameters. + :param text: Inner text between the open and close sequences. + """ + + params: HyperlinkParams + text: str + + @classmethod + def find_close(cls, text: str, open_end: int) -> tuple[int, int]: + """ + Find the matching OSC 8 close sequence. + + Searches 'text' starting at 'open_end', the position just past the open + sequence. 
Returns position of close sequence ``(close_start, + close_end)`` or ``(-1, -1)`` if not found. + + Per the OSC 8 specification, terminal emulators treat hyperlinks as a + state attribute, not as nested HTML anchors. A close sequence closes + the current hyperlink regardless of how many open sequences preceded it. + """ + m = HYPERLINK_CLOSE_RE.search(text, open_end) + if m is None: + return (-1, -1) + return (m.start(), m.end()) + + def display_width( + self, + *, + control_codes: typing.Literal['parse', 'strict', 'ignore'] = 'parse', + tabsize: int = 8, + ambiguous_width: int = 1, + ) -> int: + r""" + Measure the display width of the hyperlink's inner text. + + Delegates to :func:`wcwidth.width` with the given parameters. + + Example:: + + >>> hl = Hyperlink.parse('\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07', 0) + >>> hl.display_width() + 5 + """ + return _width( + self.text, + control_codes=control_codes, + tabsize=tabsize, + ambiguous_width=ambiguous_width, + ) + + @classmethod + def parse(cls, text: str, start: int = 0) -> Hyperlink | None: + r""" + Parse a complete OSC 8 hyperlink unit from *text* at position *start*. + + Locates the open sequence, finds the matching close, and returns a + ``Hyperlink`` containing the parsed parameters and inner text. Returns + ``None`` if the text at *start* is not a complete OSC 8 hyperlink. 
+ + Example:: + + >>> Hyperlink.parse('\x1b]8;;http://example.com\x07Hello\x1b]8;;\x07') + Hyperlink(params=HyperlinkParams(url='http://example.com', ...), text='Hello') + """ + m = _SEQUENCE_CLASSIFY.match(text, start) + if m is None: + return None + params = HyperlinkParams.parse(m.group()) + if params is None: + return None + close_start, close_end = cls.find_close(text, m.end()) + if (close_start, close_end) == (-1, -1): + return None + return cls(params=params, text=text[m.end():close_start]) + + def make_sequence(self) -> str: + """Rebuild the complete OSC 8 hyperlink escape sequence.""" + return self.params.make_open() + self.text + self.params.make_close() diff --git a/wcwidth/sgr_state.py b/wcwidth/sgr_state.py index b0c86484..8e6e5ccf 100644 --- a/wcwidth/sgr_state.py +++ b/wcwidth/sgr_state.py @@ -5,6 +5,7 @@ etc.) via public API propagate_sgr(), and its dependent functions, cut() and wrap(). It only has attributes necessary to perform its functions, eg 'RED' and 'BLUE' attributes are not defined. """ + from __future__ import annotations # std imports @@ -307,7 +308,7 @@ def propagate_sgr(lines: Sequence[str]) -> list[str]: ['\x1b[31mhello\x1b[0m', '\x1b[31mworld\x1b[0m'] This is useful in cases of making special editors and viewers, and is used for the - default modes (propagate_sgr=True) of :func:`wcwidth.width` and :func:`wcwidth.clip`. + default modes (propagate_sgr=True) of :func:`wcwidth.wrap` and :func:`wcwidth.clip`. When wrapping and clipping text containing SGR sequences, maybe a previous line enabled the BLUE color--if we are viewing *only* the line following, we would want the carry over the BLUE color, diff --git a/wcwidth/table_ambiguous.py b/wcwidth/table_ambiguous.py index e3dc0b1c..d2fdd6b8 100644 --- a/wcwidth/table_ambiguous.py +++ b/wcwidth/table_ambiguous.py @@ -1,7 +1,7 @@ """ Exports AMBIGUOUS_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-18 23:27:15 UTC. 
+This code generated by python wcwidth project. """ # pylint: disable=duplicate-code AMBIGUOUS_EASTASIAN = { diff --git a/wcwidth/table_grapheme.py b/wcwidth/table_grapheme.py index 42fd19e0..563792af 100644 --- a/wcwidth/table_grapheme.py +++ b/wcwidth/table_grapheme.py @@ -4,7 +4,7 @@ This module provides lookup tables for Unicode grapheme cluster break properties as defined in UAX #29: Unicode Text Segmentation. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 23:33:42 UTC. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code diff --git a/wcwidth/table_mc.py b/wcwidth/table_mc.py index 7c2e6915..663e93b7 100644 --- a/wcwidth/table_mc.py +++ b/wcwidth/table_mc.py @@ -1,7 +1,7 @@ """ Exports CATEGORY_MC table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-29 00:47:54 UTC. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code CATEGORY_MC = { diff --git a/wcwidth/table_vs16.py b/wcwidth/table_vs16.py index 70e4a737..9420156e 100644 --- a/wcwidth/table_vs16.py +++ b/wcwidth/table_vs16.py @@ -1,7 +1,7 @@ """ Exports VS16_NARROW_TO_WIDE table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2025-09-15 16:57:50 UTC. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code VS16_NARROW_TO_WIDE = { diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index ed6f48a7..4ad7bc1e 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:58:17 UTC. +This code generated by python wcwidth project. 
""" # pylint: disable=duplicate-code WIDE_EASTASIAN = { @@ -72,8 +72,8 @@ (0x0ffe0, 0x0ffe6,), # Fullwidth Cent Sign ..Fullwidth Won Sign (0x16fe0, 0x16fe3,), # Tangut Iteration Mark ..Old Chinese Iteration Ma (0x16ff2, 0x16ff6,), # Chinese Small Simplified..Yangqin Sign Slow Two Be - (0x17000, 0x18cd5,), # (nil) ..Khitan Small Script Char - (0x18cff, 0x18d1e,), # Khitan Small Script Char..(nil) + (0x17000, 0x18cd5,), # Tangut Ideograph-17000 ..Khitan Small Script Char + (0x18cff, 0x18d1e,), # Khitan Small Script Char..Tangut Ideograph-18d1e (0x18d80, 0x18df2,), # Tangut Component-769 ..Tangut Component-883 (0x1aff0, 0x1aff3,), # Katakana Letter Minnan T..Katakana Letter Minnan T (0x1aff5, 0x1affb,), # Katakana Letter Minnan T..Katakana Letter Minnan N diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index c440bfcf..bee2431a 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,7 +1,7 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2026-01-30 00:48:24 UTC. +This code generated by python wcwidth project. """ # pylint: disable=duplicate-code ZERO_WIDTH = { diff --git a/wcwidth/textwrap.py b/wcwidth/textwrap.py index 4582cd5e..02cc31df 100644 --- a/wcwidth/textwrap.py +++ b/wcwidth/textwrap.py @@ -4,55 +4,26 @@ This module provides functions for wrapping text that may contain terminal escape sequences, with proper handling of Unicode grapheme clusters and character display widths. 
""" + from __future__ import annotations # std imports -import re import secrets import textwrap -from typing import TYPE_CHECKING, NamedTuple +from typing import TYPE_CHECKING, Optional # local -from .wcwidth import width as _width -from .wcwidth import iter_sequences +from ._width import width as wcwidth_width from .grapheme import iter_graphemes +from .hyperlink import HyperlinkParams from .sgr_state import propagate_sgr as _propagate_sgr -from .escape_sequences import ZERO_WIDTH_PATTERN +from .escape_sequences import ZERO_WIDTH_PATTERN, iter_sequences if TYPE_CHECKING: # pragma: no cover from typing import Any, Literal -class _HyperlinkState(NamedTuple): - """State for tracking an open OSC 8 hyperlink across line breaks.""" - - url: str # hyperlink target URL - params: str # id=xxx and other key=value pairs separated by : - terminator: str # BEL (\x07) or ST (\x1b\\) - - -# Hyperlink parsing: captures (params, url, terminator) -_HYPERLINK_OPEN_RE = re.compile(r'\x1b]8;([^;]*);([^\x07\x1b]*)(\x07|\x1b\\)') - - -def _parse_hyperlink_open(seq: str) -> _HyperlinkState | None: - """Parse OSC 8 open sequence, return state or None.""" - if (m := _HYPERLINK_OPEN_RE.match(seq)): - return _HyperlinkState(url=m.group(2), params=m.group(1), terminator=m.group(3)) - return None - - -def _make_hyperlink_open(url: str, params: str, terminator: str) -> str: - """Generate OSC 8 open sequence.""" - return f'\x1b]8;{params};{url}{terminator}' - - -def _make_hyperlink_close(terminator: str) -> str: - """Generate OSC 8 close sequence.""" - return f'\x1b]8;;{terminator}' - - class SequenceTextWrapper(textwrap.TextWrapper): """ Sequence-aware text wrapper extending :class:`textwrap.TextWrapper`. 
@@ -99,8 +70,8 @@ def _next_hyperlink_id() -> str: def _width(self, text: str) -> int: """Measure text width accounting for sequences.""" - return _width(text, control_codes=self.control_codes, tabsize=self.tabsize, - ambiguous_width=self.ambiguous_width) + return wcwidth_width(text, control_codes=self.control_codes, tabsize=self.tabsize, + ambiguous_width=self.ambiguous_width) def _strip_sequences(self, text: str) -> str: """Strip all terminal sequences from text.""" @@ -241,9 +212,9 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines: list[str] = [] is_first_line = True - hyperlink_state: _HyperlinkState | None = None + hyperlink_state: Optional[HyperlinkParams] = None # Track the id we're using for the current hyperlink continuation - current_hyperlink_id: str | None = None + current_hyperlink_id: Optional[str] = None # Arrange in reverse order so items can be efficiently popped chunks = list(reversed(chunks)) @@ -258,8 +229,11 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m # If continuing a hyperlink from previous line, prepend open sequence if hyperlink_state is not None: - open_seq = _make_hyperlink_open( - hyperlink_state.url, hyperlink_state.params, hyperlink_state.terminator) + open_seq = HyperlinkParams( + url=hyperlink_state.url, + params=hyperlink_state.params, + terminator=hyperlink_state.terminator, + ).make_open() chunks[-1] = open_seq + chunks[-1] # Drop leading whitespace (except at very start) @@ -332,26 +306,33 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m if 'id=' in new_state.params: current_hyperlink_id = new_state.params elif new_state.params: - # Prepend id to existing params (per OSC 8 spec, params can have - # multiple key=value pairs separated by :) + # Prepend id to existing params. Per OSC 8 spec, params can have + # multiple key=value pairs separated by ':'. 
current_hyperlink_id = ( f'id={self._next_hyperlink_id()}:{new_state.params}') else: current_hyperlink_id = f'id={self._next_hyperlink_id()}' - line_content += _make_hyperlink_close(new_state.terminator) + line_content += HyperlinkParams( + terminator=new_state.terminator, url='').make_close() # Also need to inject the id into the opening # sequence if it didn't have one if 'id=' not in new_state.params: # Find and replace the original open sequence with one that has id - old_open = _make_hyperlink_open( - new_state.url, new_state.params, new_state.terminator) - new_open = _make_hyperlink_open( - new_state.url, current_hyperlink_id, new_state.terminator) + old_open = HyperlinkParams( + url=new_state.url, + params=new_state.params, + terminator=new_state.terminator, + ).make_open() + new_open = HyperlinkParams( + url=new_state.url, + params=current_hyperlink_id, + terminator=new_state.terminator, + ).make_open() line_content = line_content.replace(old_open, new_open, 1) # Update state for next line, using computed id - hyperlink_state = _HyperlinkState( + hyperlink_state = HyperlinkParams( new_state.url, current_hyperlink_id, new_state.terminator) else: hyperlink_state = None @@ -364,7 +345,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m lines.append(indent + line_content) is_first_line = False else: - # max_lines reached with remaining content — + # max_lines reached with remaining content. # pop chunks until placeholder fits, then break. 
placeholder_w = self._width(self.placeholder) while current_line: @@ -375,8 +356,8 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m new_state = self._track_hyperlink_state( line_content, hyperlink_state) if new_state is not None: - line_content += _make_hyperlink_close( - new_state.terminator) + line_content += HyperlinkParams( + terminator=new_state.terminator, url='').make_close() lines.append(indent + line_content + self.placeholder) break current_width -= self._width(current_line[-1]) @@ -395,7 +376,7 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-m def _track_hyperlink_state( self, text: str, - state: _HyperlinkState | None) -> _HyperlinkState | None: + state: Optional[HyperlinkParams]) -> Optional[HyperlinkParams]: """ Track hyperlink state through text. @@ -405,7 +386,7 @@ def _track_hyperlink_state( """ for segment, is_seq in iter_sequences(text): if is_seq: - parsed_link = _parse_hyperlink_open(segment) + parsed_link = HyperlinkParams.parse(segment) if parsed_link is not None and parsed_link.url: # has URL = open state = parsed_link elif segment.startswith(('\x1b]8;;\x1b\\', '\x1b]8;;\x07')): # close @@ -545,7 +526,7 @@ def wrap(text: str, width: int = 70, *, break_long_words: bool = True, break_on_hyphens: bool = True, drop_whitespace: bool = True, - max_lines: int | None = None, + max_lines: Optional[int] = None, placeholder: str = ' [...]', propagate_sgr: bool = True) -> list[str]: r""" diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index f6edf5fe..e4895e99 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -1,82 +1,38 @@ """ -This is a python implementation of wcwidth() and wcswidth(). +Legacy compatibility module for wcwidth.wcwidth. -https://github.com/jquast/wcwidth +This file contains no new definitions and is provided only for backwards +compatibility. 
This module exists solely to support legacy import paths:: -from Markus Kuhn's C code, retrieved from: - - http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - -This is an implementation of wcwidth() and wcswidth() (defined in -IEEE Std 1002.1-2001) for Unicode. - -http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html -http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html - -In fixed-width output devices, Latin characters all occupy a single -"cell" position of equal width, whereas ideographic CJK characters -occupy two such cells. Interoperability between terminal-line -applications and (teletype-style) character terminals using the -UTF-8 encoding requires agreement on which character should advance -the cursor by how many cell positions. No established formal -standards exist at present on which Unicode character shall occupy -how many cell positions on character terminals. These routines are -a first attempt of defining such behavior based on simple rules -applied to data provided by the Unicode Consortium. - -For some graphical characters, the Unicode standard explicitly -defines a character-cell width via the definition of the East Asian -FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. -In all these cases, there is no ambiguity about which width a -terminal shall use. For characters in the East Asian Ambiguous (A) -class, the width choice depends purely on a preference of backward -compatibility with either historic CJK or Western practice. -Choosing single-width for these characters is easy to justify as -the appropriate long-term solution, as the CJK practice of -displaying these characters as double-width comes from historic -implementation simplicity (8-bit encoded characters were displayed -single-width and 16-bit ones double-width, even for Greek, -Cyrillic, etc.) and not any typographic considerations. - -Much less clear is the choice of width for the Not East Asian -(Neutral) class. 
Existing practice does not dictate a width for any -of these characters. It would nevertheless make sense -typographically to allocate two character cells to characters such -as for instance EM SPACE or VOLUME INTEGRAL, which cannot be -represented adequately with a single-width glyph. The following -routines at present merely assign a single-cell width to all -neutral characters, in the interest of simplicity. This is not -entirely satisfactory and should be reconsidered before -establishing a formal standard in this area. At the moment, the -decision which Not East Asian (Neutral) characters should be -represented by double-width glyphs cannot yet be answered by -applying a simple rule from the Unicode database content. Setting -up a proper standard for the behavior of UTF-8 character terminals -will require a careful analysis not only of each Unicode character, -but also of each presentation form, something the author of these -routines has avoided to do so far. - -http://www.unicode.org/unicode/reports/tr11/ - -Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + from wcwidth.wcwidth import iter_graphemes + from wcwidth.wcwidth import _SGR_PATTERN + import wcwidth.wcwidth as legacy """ - -from __future__ import annotations - -# std imports -from functools import lru_cache - -from typing import TYPE_CHECKING +# pylint: disable=unused-import # local +from ._clip import clip +from .align import ljust, rjust, center +from ._width import _CONTROL_CHAR_TABLE, _WIDTH_FAST_PATH_MIN_LEN, width, _width_ignored_codes +from ._wcwidth import wcwidth, _wcmatch_version, _wcversion_value from .bisearch import bisearch as _bisearch from .grapheme import iter_graphemes from .table_mc import CATEGORY_MC +from ._wcswidth import wcswidth from .sgr_state import (_SGR_PATTERN, _SGR_STATE_DEFAULT, _sgr_state_update, _sgr_state_is_active, _sgr_state_to_sequence) +from ._constants import (_EMOJI_ZWJ_SET, + _ISC_VIRAMA_SET, + _LATEST_VERSION, + _AMBIGUOUS_TABLE, + 
_ZERO_WIDTH_TABLE, + _CATEGORY_MC_TABLE, + _FITZPATRICK_RANGE, + _WIDE_EASTASIAN_TABLE, + _REGIONAL_INDICATOR_SET) from .table_vs16 import VS16_NARROW_TO_WIDE from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH @@ -86,78 +42,13 @@ from .escape_sequences import (ZERO_WIDTH_PATTERN, CURSOR_LEFT_SEQUENCE, CURSOR_RIGHT_SEQUENCE, - INDETERMINATE_EFFECT_SEQUENCE) + INDETERMINATE_EFFECT_SEQUENCE, + iter_sequences, + strip_sequences) from .unicode_versions import list_versions -if TYPE_CHECKING: # pragma: no cover - # std imports - from collections.abc import Iterator - - from typing import Literal - -# Pre-compute table references for the latest (and only) Unicode version. -_LATEST_VERSION = list_versions()[-1] -_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION] -_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION] -_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))] -_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION] -_REGIONAL_INDICATOR_SET = frozenset( - range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1) -) -_EMOJI_ZWJ_SET = frozenset( - cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1) -) | _REGIONAL_INDICATOR_SET -_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF) -# Indic_Syllabic_Category=Virama codepoints, from IndicSyllabicCategory.txt. -# These are structurally tied to their scripts and not expected to change. 
-# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt -_ISC_VIRAMA_SET = frozenset(( - 0x094D, # DEVANAGARI SIGN VIRAMA - 0x09CD, # BENGALI SIGN VIRAMA - 0x0A4D, # GURMUKHI SIGN VIRAMA - 0x0ACD, # GUJARATI SIGN VIRAMA - 0x0B4D, # ORIYA SIGN VIRAMA - 0x0BCD, # TAMIL SIGN VIRAMA - 0x0C4D, # TELUGU SIGN VIRAMA - 0x0CCD, # KANNADA SIGN VIRAMA - 0x0D4D, # MALAYALAM SIGN VIRAMA - 0x0DCA, # SINHALA SIGN AL-LAKUNA - 0x1B44, # BALINESE ADEG ADEG - 0xA806, # SYLOTI NAGRI SIGN HASANTA - 0xA8C4, # SAURASHTRA SIGN VIRAMA - 0xA9C0, # JAVANESE PANGKON - 0x11046, # BRAHMI VIRAMA - 0x110B9, # KAITHI SIGN VIRAMA - 0x111C0, # SHARADA SIGN VIRAMA - 0x11235, # KHOJKI SIGN VIRAMA - 0x1134D, # GRANTHA SIGN VIRAMA - 0x11442, # NEWA SIGN VIRAMA - 0x114C2, # TIRHUTA SIGN VIRAMA - 0x115BF, # SIDDHAM SIGN VIRAMA - 0x1163F, # MODI SIGN VIRAMA - 0x116B6, # TAKRI SIGN VIRAMA - 0x11839, # DOGRA SIGN VIRAMA - 0x119E0, # NANDINAGARI SIGN VIRAMA - 0x11C3F, # BHAIKSUKI SIGN VIRAMA -)) _ISC_CONSONANT_TABLE = ISC_CONSONANT -# In 'parse' mode, strings longer than this are checked for cursor-movement -# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to -# 'ignore' to skip character-by-character parsing. The detection scan cost is -# negligible for long strings but wasted on short ones like labels or headings. -_WIDTH_FAST_PATH_MIN_LEN = 20 - -# Translation table to strip C0/C1 control characters for fast 'ignore' mode. -_CONTROL_CHAR_TABLE = str.maketrans('', '', ( - ''.join(chr(c) for c in range(0x00, 0x20)) + # C0: NUL through US (including tab) - '\x7f' + # DEL - ''.join(chr(c) for c in range(0x80, 0xa0)) # C1: U+0080-U+009F -)) - -# Unlike wcwidth.__all__, wcwidth.wcwidth.__all__ is NOT for the purpose of defining a public API, -# or what we prefer to be imported with statement, "from wcwidth.wcwidth import *". Explicitly -# re-export imports here for no other reason than to satisfy the type checkers (mypy). Yak shavings. 
__all__ = ( 'ZERO_WIDTH', 'WIDE_EASTASIAN', @@ -176,851 +67,3 @@ '_wcmatch_version', '_wcversion_value', ) - - -# maxsize=1024: western scripts need ~64 unique codepoints per session, but -# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates -# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. - -@lru_cache(maxsize=1024) -def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int: # pylint: disable=unused-argument - r""" - Given one Unicode codepoint, return its printable length on a terminal. - - :param wc: A single Unicode character. - :param unicode_version: Ignored. Retained for backwards compatibility. - - .. deprecated:: 0.3.0 - Only the latest Unicode version is now shipped. - - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts - where ambiguous characters display as double-width. See - :ref:`ambiguous_width` for details. - :returns: The width, in cells, necessary to display the character of - Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has - no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is - not printable, or has an indeterminate effect on the terminal, such as - a control character. Otherwise, the number of column positions the - character occupies on a graphic terminal (1 or 2) is returned. - - See :ref:`Specification` for details of cell measurement. - """ - ucs = ord(wc) if wc else 0 - - # small optimization: early return of 1 for printable ASCII, this provides - # approximately 40% performance improvement for mostly-ascii documents, with - # less than 1% impact to others. 
- if 32 <= ucs < 0x7f: - return 1 - - # C0/C1 control characters are -1 for compatibility with POSIX-like calls - if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: - return -1 - - # Zero width - if _bisearch(ucs, _ZERO_WIDTH_TABLE): - return 0 - - # Wide (F/W categories) - if _bisearch(ucs, _WIDE_EASTASIAN_TABLE): - return 2 - - # Ambiguous width (A category) - only when ambiguous_width=2 - if ambiguous_width == 2 and _bisearch(ucs, _AMBIGUOUS_TABLE): - return 2 - - return 1 - - -def wcswidth( - pwcs: str, - n: int | None = None, - unicode_version: str = 'auto', - ambiguous_width: int = 1, -) -> int: - """ - Given a unicode string, return its printable length on a terminal. - - :param pwcs: Measure width of given unicode string. - :param n: When ``n`` is None (default), return the length of the entire - string, otherwise only the first ``n`` characters are measured. - - :param unicode_version: Ignored. Retained for backwards compatibility. - - .. deprecated:: 0.3.0 - Only the latest Unicode version is now shipped. - - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: The width, in cells, needed to display the first ``n`` characters - of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control - characters! - - See :ref:`Specification` for details of cell measurement. - """ - # pylint: disable=unused-argument,too-many-locals,too-many-statements - # pylint: disable=too-complex,too-many-branches - # This function intentionally kept long without delegating functions to reduce function calls in - # "hot path", the overhead per-character adds up. 
- - # Fast path: pure ASCII printable strings are always width == length - if n is None and pwcs.isascii() and pwcs.isprintable(): - return len(pwcs) - - # Select wcwidth call pattern for best lru_cache performance: - # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls - # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) - _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) - - end = len(pwcs) if n is None else n - total_width = 0 - idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) - while idx < end: - char = pwcs[idx] - ucs = ord(char) - if ucs == 0x200D: - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < end: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - if ucs == 0xFE0F and last_measured_idx >= 0: - # VS16 following a measured character: add 1 if that character is - # known to be converted from narrow to wide by VS16. 
- total_width += _bisearch(ord(pwcs[last_measured_idx]), - VS16_NARROW_TO_WIDE["9.0.0"]) - last_measured_idx = -2 # Prevent double application - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs only when the last one is - # received, because RI's are received so rarely its better than per-loop tracking of - # 'last char was an RI'. - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd - # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag - # and wide 'U'. - idx += 1 - last_measured_ucs = ucs - continue - # First or unpaired RI: measured normally (width 2 from table) - # Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - # Virama conjunct formation: consonant following virama contributes 0 width. 
- # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - wcw = _wcwidth(char) - if wcw < 0: - # early return -1 on C0 and C1 control characters - return wcw - if wcw > 0: - if conjunct_pending: - total_width += 1 - conjunct_pending = False - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - wcw = 1 - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - total_width += wcw - idx += 1 - if conjunct_pending: - total_width += 1 - return total_width - - -# NOTE: _wcversion_value and _wcmatch_version are no longer used internally -# by wcwidth since version 0.5.0 (only the latest Unicode version is shipped). -# -# They are retained for API compatibility with external tools like ucs-detect -# that may use these private functions. - - -@lru_cache(maxsize=128) -def _wcversion_value(ver_string: str) -> tuple[int, ...]: # pragma: no cover - """ - Integer-mapped value of given dotted version string. - - .. deprecated:: 0.3.0 - - This function is no longer used internally by wcwidth but is retained - for API compatibility with external tools. - - :param ver_string: Unicode version string, of form ``n.n.n``. - :returns: tuple of digit tuples, ``tuple(int, [...])``. - """ - retval = tuple(map(int, (ver_string.split('.')))) - return retval - - -@lru_cache(maxsize=8) -def _wcmatch_version(given_version: str) -> str: # pylint: disable=unused-argument - """ - Return the supported Unicode version level. - - .. deprecated:: 0.3.0 - This function now always returns the latest version. 
- - This function is no longer used internally by wcwidth but is retained - for API compatibility with external tools. - - :param given_version: Ignored. Any value is accepted for compatibility. - :returns: The latest unicode version string. - """ - return _LATEST_VERSION - - -def iter_sequences(text: str) -> Iterator[tuple[str, bool]]: - r""" - Iterate through text, yielding segments with sequence identification. - - This generator yields tuples of ``(segment, is_sequence)`` for each part - of the input text, where ``is_sequence`` is ``True`` if the segment is - a recognized terminal escape sequence. - - :param text: String to iterate through. - :returns: Iterator of (segment, is_sequence) tuples. - - .. versionadded:: 0.3.0 - - Example:: - - >>> list(iter_sequences('hello')) - [('hello', False)] - >>> list(iter_sequences('\x1b[31mred')) - [('\x1b[31m', True), ('red', False)] - >>> list(iter_sequences('\x1b[1m\x1b[31m')) - [('\x1b[1m', True), ('\x1b[31m', True)] - """ - idx = 0 - text_len = len(text) - segment_start = 0 - - while idx < text_len: - char = text[idx] - - if char == '\x1b': - # Yield any accumulated non-sequence text - if idx > segment_start: - yield (text[segment_start:idx], False) - - # Try to match an escape sequence - match = ZERO_WIDTH_PATTERN.match(text, idx) - if match: - yield (match.group(), True) - idx = match.end() - else: - # Lone ESC or unrecognized - yield as sequence anyway - yield (char, True) - idx += 1 - segment_start = idx - else: - idx += 1 - - # Yield any remaining text - if segment_start < text_len: - yield (text[segment_start:], False) - - -def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: - """ - Fast path for width() with control_codes='ignore'. - - Strips escape sequences and control characters, then measures remaining text. 
- """ - return wcswidth( - strip_sequences(text).translate(_CONTROL_CHAR_TABLE), - ambiguous_width=ambiguous_width - ) - - -def width( - text: str, - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - tabsize: int = 8, - ambiguous_width: int = 1, -) -> int: - r""" - Return printable width of text containing many kinds of control codes and sequences. - - Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal - output sequences. Never returns -1. - - :param text: String to measure. - :param control_codes: How to handle control characters and sequences: - - - ``'parse'`` (default): Track horizontal cursor movement from BS ``\b``, CR ``\r``, TAB - ``\t``, and cursor left and right movement sequences. Vertical movement (LF, VT, FF) and - indeterminate sequences are zero-width. Never raises. - - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with - indeterminate results of the screen or cursor, like clear or vertical movement. Generally, - these should be handled with a virtual terminal emulator (like 'pyte'). - - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as - width 0. This is the fastest measurement for text already filtered or known not to contain - any kinds of control codes or sequences. TAB ``\t`` is zero-width; for tab expansion, - pre-process: ``text.replace('\t', ' ' * 8)``. - - :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. - Must be positive. Has no effect when ``control_codes='ignore'``. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences - present in ``text`` according to given parameters. This represents the rightmost column the - cursor reaches. Always a non-negative integer. 
- - :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate - effects, such as vertical movement or clear sequences are encountered, or on unexpected - C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values. - - .. versionadded:: 0.3.0 - - Examples:: - - >>> width('hello') - 5 - >>> width('コンニチハ') - 10 - >>> width('\x1b[31mred\x1b[0m') - 3 - >>> width('\x1b[31mred\x1b[0m', control_codes='ignore') # same result (ignored) - 3 - >>> width('123\b4') # backspace overwrites previous cell (outputs '124') - 3 - >>> width('abc\t') # tab caused cursor to move to column 8 - 8 - >>> width('1\x1b[10C') # '1' + cursor right 10, cursor ends on column 11 - 11 - >>> width('1\x1b[10C', control_codes='ignore') # faster but wrong in this case - 1 - """ - # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals - # This could be broken into sub-functions (#1, #3, and 6 especially), but for reduced overhead - # considering this function is a likely "hot path", they are inlined, breaking many of our - # complexity rules. - - # Fast path for ASCII printable (no tabs, escapes, or control chars) - if text.isascii() and text.isprintable(): - return len(text) - - # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. - # Only check for longer strings - the detection overhead hurts short string performance. 
- if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: - # Check for cursor-affecting control characters - if '\b' not in text and '\t' not in text and '\r' not in text: - # Check for escape sequences - if none, or only non-cursor-movement sequences - if '\x1b' not in text or ( - not CURSOR_RIGHT_SEQUENCE.search(text) and - not CURSOR_LEFT_SEQUENCE.search(text) - ): - control_codes = 'ignore' - - # Fast path for ignore mode -- this is useful if you know the text is already "clean" - if control_codes == 'ignore': - return _width_ignored_codes(text, ambiguous_width) - - strict = control_codes == 'strict' - # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0. - # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width. - current_col = 0 - max_extent = 0 - idx = 0 - last_measured_idx = -2 # Track index of last measured char for VS16; -2 can never match idx-1 - last_measured_ucs = -1 # Codepoint of last measured char (for deferred emoji check) - last_was_virama = False # Virama conjunct formation state - conjunct_pending = False # Deferred +1 for bare conjuncts (no trailing Mc) - text_len = len(text) - - # Select wcwidth call pattern for best lru_cache performance: - # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls - # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) - _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) - - while idx < text_len: - char = text[idx] - - # 1. 
Handle ESC sequences - if char == '\x1b': - match = ZERO_WIDTH_PATTERN.match(text, idx) - if match: - seq = match.group() - if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): - raise ValueError(f"Indeterminate cursor sequence at position {idx}") - # Apply cursor movement - right = CURSOR_RIGHT_SEQUENCE.match(seq) - if right: - current_col += int(right.group(1) or 1) - else: - left = CURSOR_LEFT_SEQUENCE.match(seq) - if left: - current_col = max(0, current_col - int(left.group(1) or 1)) - idx = match.end() - else: - idx += 1 - max_extent = max(max_extent, current_col) - continue - - # 2. Handle illegal and vertical control characters (zero width, error in strict) - if char in ILLEGAL_CTRL: - if strict: - raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") - idx += 1 - continue - - if char in VERTICAL_CTRL: - if strict: - raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") - idx += 1 - continue - - # 3. Handle horizontal movement characters - if char in HORIZONTAL_CTRL: - if char == '\x09' and tabsize > 0: # Tab - current_col += tabsize - (current_col % tabsize) - elif char == '\x08': # Backspace - if current_col > 0: - current_col -= 1 - elif char == '\x0d': # Carriage return - current_col = 0 - max_extent = max(max_extent, current_col) - idx += 1 - continue - - # 4. Handle ZWJ - if char == '\u200D': - if last_was_virama: - # ZWJ after virama requests explicit half-form rendering but - # does not change cell count — consume ZWJ only, let the next - # consonant be handled by the virama conjunct rule. - idx += 1 - elif idx + 1 < text_len: - # Emoji ZWJ: skip next character unconditionally. - idx += 2 - last_was_virama = False - else: - idx += 1 - last_was_virama = False - continue - - # 5. Handle other zero-width characters (control chars) - if char in ZERO_WIDTH_CTRL: - idx += 1 - continue - - ucs = ord(char) - - # 6. 
Handle VS16: converts preceding narrow character to wide - if ucs == 0xFE0F: - if last_measured_idx == idx - 1: - if _bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]): - current_col += 1 - max_extent = max(max_extent, current_col) - # VS16 preserves emoji context: last_measured_ucs stays as the base - idx += 1 - continue - - # 6b. Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+) - if ucs > 0xFFFF: - if ucs in _REGIONAL_INDICATOR_SET: - # Lazy RI pairing: count preceding consecutive RIs - ri_before = 0 - j = idx - 1 - while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: - ri_before += 1 - j -= 1 - if ri_before % 2 == 1: - last_measured_ucs = ucs - idx += 1 - continue - # 6c. Fitzpatrick modifier: zero-width when following emoji base - elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] - and last_measured_ucs in _EMOJI_ZWJ_SET): - idx += 1 - continue - - # 7. Virama conjunct formation: consonant following virama contributes 0 width. - # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category - if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE): - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - conjunct_pending = True - idx += 1 - continue - - # 8. 
Normal characters: measure with wcwidth - w = _wcwidth(char) - if w > 0: - if conjunct_pending: - current_col += 1 - conjunct_pending = False - current_col += w - max_extent = max(max_extent, current_col) - last_measured_idx = idx - last_measured_ucs = ucs - last_was_virama = False - elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): - # Spacing Combining Mark (Mc) following a base character adds 1 - current_col += 1 - max_extent = max(max_extent, current_col) - last_measured_idx = -2 - last_was_virama = False - conjunct_pending = False - else: - last_was_virama = ucs in _ISC_VIRAMA_SET - idx += 1 - - if conjunct_pending: - current_col += 1 - max_extent = max(max_extent, current_col) - return max_extent - - -def ljust( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text left-justified in a string of given display width. - - :param text: String to justify, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on the right to reach ``dest_width``. - - .. 
versionadded:: 0.3.0 - - Example:: - - >>> wcwidth.ljust('hi', 5) - 'hi ' - >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5) - '\x1b[31mhi\x1b[0m ' - >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - '👨‍👩‍👧 ' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - padding_cells = max(0, dest_width - text_width) - return text + fillchar * padding_cells - - -def rjust( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text right-justified in a string of given display width. - - :param text: String to justify, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on the left to reach ``dest_width``. - - .. 
versionadded:: 0.3.0 - - Example:: - - >>> wcwidth.rjust('hi', 5) - ' hi' - >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5) - ' \x1b[31mhi\x1b[0m' - >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - padding_cells = max(0, dest_width - text_width) - return fillchar * padding_cells + text - - -def center( - text: str, - dest_width: int, - fillchar: str = ' ', - *, - control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', - ambiguous_width: int = 1, -) -> str: - r""" - Return text centered in a string of given display width. - - :param text: String to center, may contain terminal sequences. - :param dest_width: Total display width of result in terminal cells. - :param fillchar: Single character for padding (default space). Must have - display width of 1 (not wide, not zero-width, not combining). Unicode - characters like ``'·'`` are acceptable. The width is not validated. - :param control_codes: How to handle control sequences when measuring. - Passed to :func:`width` for measurement. - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :returns: Text padded on both sides to reach ``dest_width``. - - For odd-width padding, the extra cell goes on the right (matching - Python's :meth:`str.center` behavior). - - .. 
versionadded:: 0.3.0 - - Example:: - - >>> wcwidth.center('hi', 6) - ' hi ' - >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6) - ' \x1b[31mhi\x1b[0m ' - >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6) - ' 👨‍👩‍👧 ' - """ - if text.isascii() and text.isprintable(): - text_width = len(text) - else: - text_width = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width) - total_padding = max(0, dest_width - text_width) - # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html - left_pad = total_padding // 2 + (total_padding & dest_width & 1) - right_pad = total_padding - left_pad - return fillchar * left_pad + text + fillchar * right_pad - - -def strip_sequences(text: str) -> str: - r""" - Return text with all terminal escape sequences removed. - - Unknown or incomplete ESC sequences are preserved. - - :param text: String that may contain terminal escape sequences. - :returns: The input text with all escape sequences stripped. - - .. versionadded:: 0.3.0 - - Example:: - - >>> strip_sequences('\x1b[31mred\x1b[0m') - 'red' - >>> strip_sequences('hello') - 'hello' - >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text') - 'bold red text' - """ - return ZERO_WIDTH_PATTERN.sub('', text) - - -def clip( - text: str, - start: int, - end: int, - *, - fillchar: str = ' ', - tabsize: int = 8, - ambiguous_width: int = 1, - propagate_sgr: bool = True, -) -> str: - r""" - Clip text to display columns ``(start, end)`` while preserving all terminal sequences. - - This function extracts a substring based on visible column positions rather than - character indices. Terminal escape sequences are preserved in the output since - they have zero display width. If a wide character (width 2) would be split at - either boundary, it is replaced with ``fillchar``. - - TAB characters (``\t``) are expanded to spaces up to the next tab stop, - controlled by the ``tabsize`` parameter. 
- - Other cursor movement characters (backspace, carriage return) and cursor - movement sequences are passed through unchanged as zero-width. - - :param text: String to clip, may contain terminal escape sequences. - :param start: Absolute starting column (inclusive, 0-indexed). - :param end: Absolute ending column (exclusive). - :param fillchar: Character to use when a wide character must be split at - a boundary (default space). Must have display width of 1. - :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through - as zero-width (preserved in output but don't advance column position). - :param ambiguous_width: Width to use for East Asian Ambiguous (A) - characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. - :param propagate_sgr: If True (default), SGR (terminal styling) sequences - are propagated. The result begins with any active style at the start - position and ends with a reset sequence if styles are active. - :returns: Substring of ``text`` spanning display columns ``(start, end)``, - with all terminal sequences preserved and wide characters at boundaries - replaced with ``fillchar``. - - SGR (terminal styling) sequences are propagated by default. The result - begins with any active style and ends with a reset:: - - >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11) - '\x1b[1;34mworld\x1b[0m' - - Set ``propagate_sgr=False`` to disable this behavior. - - .. versionadded:: 0.3.0 - - .. versionchanged:: 0.5.0 - Added ``propagate_sgr`` parameter (default True). - - Example:: - - >>> clip('hello world', 0, 5) - 'hello' - >>> clip('中文字', 0, 3) # Wide char split at column 3 - '中 ' - >>> clip('a\tb', 0, 10) # Tab expanded to spaces - 'a b' - """ - # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks - # Again, for 'hot path', we avoid additional delegate functions and accept the cost - # of complexity for improved python performance. 
- start = max(start, 0) - if end <= start: - return '' - - # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars) - if text.isascii() and text.isprintable(): - return text[start:end] - - # Fast path: no escape sequences means no SGR tracking needed - if propagate_sgr and '\x1b' not in text: - propagate_sgr = False - - # SGR tracking state (only when propagate_sgr=True) - sgr_at_clip_start = None # state when first visible char emitted (None = not yet) - if propagate_sgr: - sgr = _SGR_STATE_DEFAULT # current SGR state, updated by all sequences - - output: list[str] = [] - col = 0 - idx = 0 - - while idx < len(text): - char = text[idx] - - # Early exit: past visible region, SGR captured, no escape ahead - if col >= end and sgr_at_clip_start is not None and char != '\x1b': - break - - # Handle escape sequences - if char == '\x1b' and (match := ZERO_WIDTH_PATTERN.match(text, idx)): - seq = match.group() - if propagate_sgr and _SGR_PATTERN.match(seq): - # Update SGR state; will be applied as prefix when visible content starts - sgr = _sgr_state_update(sgr, seq) - else: - # Non-SGR sequences always preserved - output.append(seq) - idx = match.end() - continue - - # Handle bare ESC (not a valid sequence) - if char == '\x1b': - output.append(char) - idx += 1 - continue - - # TAB expansion - if char == '\t': - if tabsize > 0: - next_tab = col + (tabsize - (col % tabsize)) - while col < next_tab: - if start <= col < end: - output.append(' ') - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += 1 - else: - output.append(char) - idx += 1 - continue - - # Grapheme clustering for everything else - grapheme = next(iter_graphemes(text, start=idx)) - w = width(grapheme, ambiguous_width=ambiguous_width) - - if w == 0: - if start <= col < end: - output.append(grapheme) - elif col >= start and col + w <= end: - # Fully visible - output.append(grapheme) - if propagate_sgr and sgr_at_clip_start is None: - 
sgr_at_clip_start = sgr - col += w - elif col < end and col + w > start: - # Partially visible (wide char at boundary) - output.append(fillchar * (min(end, col + w) - max(start, col))) - if propagate_sgr and sgr_at_clip_start is None: - sgr_at_clip_start = sgr - col += w - else: - col += w - - idx += len(grapheme) - - result = ''.join(output) - - # Apply SGR prefix/suffix - if sgr_at_clip_start is not None: - if prefix := _sgr_state_to_sequence(sgr_at_clip_start): - result = prefix + result - if _sgr_state_is_active(sgr_at_clip_start): - result += '\x1b[0m' - - return result