poezio/poopt.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185

# Copyright 2017 Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
#
# This file is part of Poezio.
#
# Poezio is free software: you can redistribute it and/or modify
# it under the terms of the zlib license. See the COPYING file.
'''This is a template module just for instruction. And poopt.'''

from typing import List, Tuple

# CFFI codepath.
from cffi import FFI

# Bind the C wcwidth() function through CFFI when this module is imported.
ffi = FFI()
# Declare the only C prototype this module calls.
# NOTE(review): wchar_t is declared here as `long`; confirm this matches the
# platform's real wchar_t (or rely on cffi's built-in wchar_t) — TODO confirm.
ffi.cdef("""
    typedef long wchar_t;
    int wcwidth(wchar_t c);
""")
# dlopen(None): presumably opens the standard C library so that wcwidth() can
# be resolved — see the cffi documentation for platform caveats.
libc = ffi.dlopen(None)

# Cython codepath.
#cdef extern from "wchar.h":
#    ctypedef Py_UCS4 wchar_t
#    int wcwidth(wchar_t c)


# Just checking if the return value is -1.  In some (all?) implementations,
# wcwidth("😆") returns -1 while it should return 1.  In these cases, we
# return 1 instead because this is by far the most probable real value.
# Since the string is received from python, and the unicode character is
# extracted with mbrtowc(), and supposing these two components are not
# bugged, and since poezio’s code should never pass '\t', '\n' or their
# friends, a return value of -1 from wcwidth() is considered to be a bug in
# wcwidth() (until proven otherwise). xwcwidth() is here to work around
# this bug.
def xwcwidth(c: str) -> int:
    '''xwcwidth(c)

    Return the number of columns needed to display the single character c.

    The value comes from the C library's wcwidth(), called on the code point
    of c.  When wcwidth() reports -1, 1 is returned instead, except for the
    '\\x19' character, for which the raw result is returned unchanged.'''

    width = libc.wcwidth(ord(c))
    # Trust wcwidth() unless it answered -1 for anything other than '\x19'.
    if width != -1 or c == '\x19':
        return width
    return 1


# cut_text: takes a string and returns a list of two-int tuples.
#
# Each tuple is a line, represented by its starting position and its ending
# position (where it should be cut).  Note that these positions are
# calculated using the position of the python string characters, not the
# individual bytes.
#
# For example (assuming every character is one column wide),
# cut_text("vivent les réfrigérateurs", 6)
# will return [(0, 6), (6, 10), (11, 17), (17, 23), (23, 25)], meaning that
# the lines are
# "vivent", " les", "réfrig", "érateu" and "rs"
def cut_text(string: str, width: int) -> List[Tuple[int, int]]:
    '''cut_text(text, width)

    Split a string into lines that each take at most width columns.

    Return a list of two-tuples, the first int is the starting position of
    the line and the second is its end.  Positions are indexes into the
    python string (characters, not bytes).

    Rules applied while scanning the string:
    - A '\\x19' character starts a formatting marker; it and every character
      up to and including the first of 'u', 'a', 'i', 'b', 'o' or '}' take
      no columns.
    - A '\\n' always ends the current line; the recorded end is the index
      just past the newline.
    - When adding a character would exceed width, the line is cut at the
      last space seen on that line if there is one (the next line starts
      right after that space), otherwise right before the current character.

    An empty string returns [(0, 0)].'''

    # The list of tuples that we return
    retlist = []

    # The start position (in the python-string) of the next line
    start_pos = 0

    # The position of the last space seen in the current line. This is used
    # to cut on spaces instead of cutting inside words, if possible (aka if
    # there is a space)
    last_space = -1
    # The number of columns taken by chars between start_pos and last_space
    cols_until_space = 0

    # Number of columns taken to display the current line so far
    columns = 0

    # The position in the python string of the character being examined.
    # It must be initialised before the loop: with an empty string the loop
    # body never runs, and the final append below would otherwise fail with
    # an UnboundLocalError.  Starting at -1 makes the last line (0, 0).
    spos = -1

    in_special_character = False
    for spos, wc in enumerate(string):
        # Special case to skip poezio special characters that are contained
        # in the python string, but should not be counted as chars because
        # they will not be displayed. Those are the formatting chars (to
        # insert colors or things like that in the string)
        if in_special_character:
            # Skip everything until the end of this format marker, but
            # without increasing the number of columns of the current
            # line. Because these chars are not printed.
            if wc in ('u', 'a', 'i', 'b', 'o', '}'):
                in_special_character = False
            continue
        if wc == '\x19':
            in_special_character = True
            continue

        # This is one condition to end the line: an explicit \n is found
        if wc == '\n':
            # Move past the newline so that the line's end is the index
            # just after it.
            # NOTE(review): when the newline is the very last character,
            # this increment is still visible after the loop, so the final
            # tuple ends at len(string) + 1.
            spos += 1
            retlist.append((start_pos, spos))

            # And then initiate a new line
            start_pos = spos
            last_space = -1
            columns = 0
            continue

        # Get the number of columns needed to display this character
        cols = xwcwidth(wc)

        # This is the second condition to end the line: we have consumed
        # enough columns to fill a whole line
        if columns + cols > width:
            # If possible, cut on a space
            if last_space != -1:
                retlist.append((start_pos, last_space))
                start_pos = last_space + 1
                last_space = -1
                # Keep only the columns of what follows the space: drop
                # everything before it, plus the space itself.
                columns -= (cols_until_space + 1)
            else:
                # Otherwise, cut in the middle of a word
                retlist.append((start_pos, spos))
                start_pos = spos
                columns = 0
        # We save the position of the last space seen in this line, and the
        # number of columns we have until now. This helps us keep track of
        # the columns to count when we will use that space as a cutting
        # point, later
        if wc == ' ':
            last_space = spos
            cols_until_space = columns
        # Add the char's columns to the line's columns
        columns += cols
    # We are at the end of the string, append the last line, not finished
    retlist.append((start_pos, spos + 1))
    return retlist


# wcswidth: An emulation of the POSIX wcswidth(3) function using xwcwidth.
def wcswidth(string: str) -> int:
    '''wcswidth(s)

    Return the number of columns needed to represent the string s, computed
    as the sum of xwcwidth() over each of its characters.  An empty string
    takes 0 columns.'''

    return sum(xwcwidth(char) for char in string)


# cut_by_columns: takes a python string and a number of columns, returns a
# python string truncated to take at most that many columns
# For example cut_by_columns("エメルカ", n) will return:
# - n == 5 -> "エメ" (which takes only 4 columns since we can't cut the
#   next character in half)
# - n == 2 -> "エ"
# - n == 1 -> ""
# - n == 42 -> "エメルカ"
# - etc
def cut_by_columns(string: str, limit: int) -> str:
    '''cut_by_columns(string, limit)

    Return the longest prefix of string that fits in limit columns.

    Characters are measured with xwcwidth().  The prefix stops before the
    first character that would go over limit, and also as soon as exactly
    limit columns have been used.'''

    # Index one past the last character kept; the whole string by default.
    end = len(string)
    used = 0
    for index, char in enumerate(string):
        # The budget is exactly spent: stop without measuring this char.
        if used == limit:
            end = index
            break
        char_width = xwcwidth(char)
        # This character does not fit in the remaining columns.
        if used + char_width > limit:
            end = index
            break
        used += char_width
    return string[:end]