FTXUI  3.0.0
C++ functional terminal UI.
Loading...
Searching...
No Matches
string.cpp
Go to the documentation of this file.
1// Most of this code is borrowed from:
2// Markus Kuhn -- 2007-05-26 (Unicode 5.0)
3// Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
4// Thank you!
5//
6// Modified by Arthur Sonzogni for FTXUI.
7
9
10#include <array> // for array
11#include <codecvt> // for codecvt_utf8_utf16
12#include <cstdint> // for uint32_t, uint8_t
13#include <locale> // for wstring_convert
14#include <string> // for string, basic_string, wstring
15
16#include "ftxui/screen/deprecated.hpp" // for wchar_width, wstring_width
17
18namespace {
19
// Closed range [first, last] of Unicode codepoints. Both bounds belong to the
// range: a codepoint equal to |first| or |last| is considered inside.
struct Interval {
  std::uint32_t first;  // Lowest codepoint of the range.
  std::uint32_t last;   // Highest codepoint of the range.
};
24
25// Sorted list of non-overlapping intervals of non-spacing characters
26// generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
//
// Each Interval is inclusive on both ends. The table is searched by
// IsCombining() through Bisearch(), which relies on the entries being sorted
// by increasing codepoint and never overlapping. Codepoints found here are
// reported with a width of 0 by codepoint_width().
27const std::array<Interval, 142> g_combining_characters = {
28 Interval{0x0300, 0x036F}, Interval{0x0483, 0x0486},
29 Interval{0x0488, 0x0489}, Interval{0x0591, 0x05BD},
30 Interval{0x05BF, 0x05BF}, Interval{0x05C1, 0x05C2},
31 Interval{0x05C4, 0x05C5}, Interval{0x05C7, 0x05C7},
32 Interval{0x0600, 0x0603}, Interval{0x0610, 0x0615},
33 Interval{0x064B, 0x065E}, Interval{0x0670, 0x0670},
34 Interval{0x06D6, 0x06E4}, Interval{0x06E7, 0x06E8},
35 Interval{0x06EA, 0x06ED}, Interval{0x070F, 0x070F},
36 Interval{0x0711, 0x0711}, Interval{0x0730, 0x074A},
37 Interval{0x07A6, 0x07B0}, Interval{0x07EB, 0x07F3},
38 Interval{0x0901, 0x0902}, Interval{0x093C, 0x093C},
39 Interval{0x0941, 0x0948}, Interval{0x094D, 0x094D},
40 Interval{0x0951, 0x0954}, Interval{0x0962, 0x0963},
41 Interval{0x0981, 0x0981}, Interval{0x09BC, 0x09BC},
42 Interval{0x09C1, 0x09C4}, Interval{0x09CD, 0x09CD},
43 Interval{0x09E2, 0x09E3}, Interval{0x0A01, 0x0A02},
44 Interval{0x0A3C, 0x0A3C}, Interval{0x0A41, 0x0A42},
45 Interval{0x0A47, 0x0A48}, Interval{0x0A4B, 0x0A4D},
46 Interval{0x0A70, 0x0A71}, Interval{0x0A81, 0x0A82},
47 Interval{0x0ABC, 0x0ABC}, Interval{0x0AC1, 0x0AC5},
48 Interval{0x0AC7, 0x0AC8}, Interval{0x0ACD, 0x0ACD},
49 Interval{0x0AE2, 0x0AE3}, Interval{0x0B01, 0x0B01},
50 Interval{0x0B3C, 0x0B3C}, Interval{0x0B3F, 0x0B3F},
51 Interval{0x0B41, 0x0B43}, Interval{0x0B4D, 0x0B4D},
52 Interval{0x0B56, 0x0B56}, Interval{0x0B82, 0x0B82},
53 Interval{0x0BC0, 0x0BC0}, Interval{0x0BCD, 0x0BCD},
54 Interval{0x0C3E, 0x0C40}, Interval{0x0C46, 0x0C48},
55 Interval{0x0C4A, 0x0C4D}, Interval{0x0C55, 0x0C56},
56 Interval{0x0CBC, 0x0CBC}, Interval{0x0CBF, 0x0CBF},
57 Interval{0x0CC6, 0x0CC6}, Interval{0x0CCC, 0x0CCD},
58 Interval{0x0CE2, 0x0CE3}, Interval{0x0D41, 0x0D43},
59 Interval{0x0D4D, 0x0D4D}, Interval{0x0DCA, 0x0DCA},
60 Interval{0x0DD2, 0x0DD4}, Interval{0x0DD6, 0x0DD6},
61 Interval{0x0E31, 0x0E31}, Interval{0x0E34, 0x0E3A},
62 Interval{0x0E47, 0x0E4E}, Interval{0x0EB1, 0x0EB1},
63 Interval{0x0EB4, 0x0EB9}, Interval{0x0EBB, 0x0EBC},
64 Interval{0x0EC8, 0x0ECD}, Interval{0x0F18, 0x0F19},
65 Interval{0x0F35, 0x0F35}, Interval{0x0F37, 0x0F37},
66 Interval{0x0F39, 0x0F39}, Interval{0x0F71, 0x0F7E},
67 Interval{0x0F80, 0x0F84}, Interval{0x0F86, 0x0F87},
68 Interval{0x0F90, 0x0F97}, Interval{0x0F99, 0x0FBC},
69 Interval{0x0FC6, 0x0FC6}, Interval{0x102D, 0x1030},
70 Interval{0x1032, 0x1032}, Interval{0x1036, 0x1037},
71 Interval{0x1039, 0x1039}, Interval{0x1058, 0x1059},
72 Interval{0x1160, 0x11FF}, Interval{0x135F, 0x135F},
73 Interval{0x1712, 0x1714}, Interval{0x1732, 0x1734},
74 Interval{0x1752, 0x1753}, Interval{0x1772, 0x1773},
75 Interval{0x17B4, 0x17B5}, Interval{0x17B7, 0x17BD},
76 Interval{0x17C6, 0x17C6}, Interval{0x17C9, 0x17D3},
77 Interval{0x17DD, 0x17DD}, Interval{0x180B, 0x180D},
78 Interval{0x18A9, 0x18A9}, Interval{0x1920, 0x1922},
79 Interval{0x1927, 0x1928}, Interval{0x1932, 0x1932},
80 Interval{0x1939, 0x193B}, Interval{0x1A17, 0x1A18},
81 Interval{0x1B00, 0x1B03}, Interval{0x1B34, 0x1B34},
82 Interval{0x1B36, 0x1B3A}, Interval{0x1B3C, 0x1B3C},
83 Interval{0x1B42, 0x1B42}, Interval{0x1B6B, 0x1B73},
84 Interval{0x1DC0, 0x1DCA}, Interval{0x1DFE, 0x1DFF},
85 Interval{0x200B, 0x200F}, Interval{0x202A, 0x202E},
86 Interval{0x2060, 0x2063}, Interval{0x206A, 0x206F},
87 Interval{0x20D0, 0x20EF}, Interval{0x302A, 0x302F},
88 Interval{0x3099, 0x309A}, Interval{0xA806, 0xA806},
89 Interval{0xA80B, 0xA80B}, Interval{0xA825, 0xA826},
90 Interval{0xFB1E, 0xFB1E}, Interval{0xFE00, 0xFE0F},
91 Interval{0xFE20, 0xFE23}, Interval{0xFEFF, 0xFEFF},
92 Interval{0xFFF9, 0xFFFB}, Interval{0x10A01, 0x10A03},
93 Interval{0x10A05, 0x10A06}, Interval{0x10A0C, 0x10A0F},
94 Interval{0x10A38, 0x10A3A}, Interval{0x10A3F, 0x10A3F},
95 Interval{0x1D167, 0x1D169}, Interval{0x1D173, 0x1D182},
96 Interval{0x1D185, 0x1D18B}, Interval{0x1D1AA, 0x1D1AD},
97 Interval{0x1D242, 0x1D244}, Interval{0xE0001, 0xE0001},
98 Interval{0xE0020, 0xE007F}, Interval{0xE0100, 0xE01EF},
99};
100
// Sorted list of non-overlapping, inclusive intervals of codepoints that
// occupy two terminal cells. The table is searched by IsFullWidth() through
// Bisearch(), which relies on the entries being sorted by increasing
// codepoint. Codepoints found here are reported with a width of 2 by
// codepoint_width().
101const std::array<Interval, 13> g_full_width_characters = {
102 Interval{0x1100, 0x115f}, Interval{0x2329, 0x2329},
103 Interval{0x232a, 0x232a}, Interval{0x2e80, 0x303e},
104 Interval{0x3040, 0xa4cf}, Interval{0xac00, 0xd7a3},
105 Interval{0xf900, 0xfaff}, Interval{0xfe10, 0xfe19},
106 Interval{0xfe30, 0xfe6f}, Interval{0xff00, 0xff60},
107 Interval{0xffe0, 0xffe6}, Interval{0x20000, 0x2fffd},
108 Interval{0x30000, 0x3fffd},
109};
110
111// Find a codepoint inside a sorted list of Interval.
112bool Bisearch(uint32_t ucs, const Interval* table, int max) {
113 if (ucs < table[0].first || ucs > table[max].last) { // NOLINT
114 return false;
115 }
116
117 int min = 0;
118 while (max >= min) {
119 int mid = (min + max) / 2;
120 if (ucs > table[mid].last) { // NOLINT
121 min = mid + 1;
122 } else if (ucs < table[mid].first) { // NOLINT
123 max = mid - 1;
124 } else {
125 return true;
126 }
127 }
128
129 return false;
130}
131
132bool IsCombining(uint32_t ucs) {
133 return Bisearch(ucs, g_combining_characters.data(),
134 g_combining_characters.size() - 1);
135}
136
137bool IsFullWidth(uint32_t ucs) {
138 if (ucs < 0x0300) // Quick path: // NOLINT
139 return false;
140
141 return Bisearch(ucs, g_full_width_characters.data(),
142 g_full_width_characters.size() - 1);
143}
144
// Returns true when |ucs| is a control character, that is when it falls in
// one of these two ranges:
//   - [0x00, 0x1F]: NUL and the other codes below the space character.
//   - [0x7F, 0x9F]: DEL and the codes that follow it, up to 0xA0 excluded.
bool IsControl(uint32_t ucs) {
  const bool below_space = ucs < 32;                      // NOLINT
  const bool del_or_above = ucs >= 0x7f && ucs < 0xa0;    // NOLINT
  return below_space || del_or_above;
}
157
158int codepoint_width(uint32_t ucs) {
159 if (IsControl(ucs)) {
160 return -1;
161 }
162
163 if (IsCombining(ucs)) {
164 return 0;
165 }
166
167 if (IsFullWidth(ucs)) {
168 return 2;
169 }
170
171 return 1;
172}
173
// Decode one codepoint from the UTF8 encoded string |input|, reading between
// 1 and 4 bytes starting at byte offset |start|.
//
// On success, returns true, stores the codepoint into |ucs| and sets |end| to
// the offset of the first byte following the sequence, ready to be used as
// |start| for the next call.
//
// On failure, returns false, leaves |ucs| untouched and sets |end| to
// |start| + 1 so that the caller skips exactly one byte. Failure happens when:
//   - |start| is past the end of |input|,
//   - the first byte is not a valid UTF8 lead byte,
//   - the sequence is truncated by the end of |input|,
//   - one of the following bytes is not a continuation byte (10xxxxxx).
//     Rejecting these prevents a malformed lead byte from swallowing the
//     valid characters that come after it.
//
// |end| may point to the variable that was passed as |start|.
bool EatCodePoint(const std::string& input,
                  size_t start,
                  size_t* end,
                  uint32_t* ucs) {
  if (start >= input.size()) {
    *end = start + 1;
    return false;
  }
  const auto lead = static_cast<uint8_t>(input[start]);

  // 1 byte sequence: 0xxxxxxx.
  if ((lead & 0b1000'0000) == 0b0000'0000) {  // NOLINT
    *ucs = lead & 0b0111'1111;                // NOLINT
    *end = start + 1;
    return true;
  }

  // Multi-byte sequence: the lead byte gives the sequence length and carries
  // the most significant payload bits.
  size_t length = 0;
  uint32_t value = 0;
  if ((lead & 0b1110'0000) == 0b1100'0000) {  // NOLINT
    length = 2;
    value = lead & 0b0001'1111;                      // NOLINT
  } else if ((lead & 0b1111'0000) == 0b1110'0000) {  // NOLINT
    length = 3;                                      // NOLINT
    value = lead & 0b0000'1111;                      // NOLINT
  } else if ((lead & 0b1111'1000) == 0b1111'0000) {  // NOLINT
    length = 4;                                      // NOLINT
    value = lead & 0b0000'0111;                      // NOLINT
  }

  // Invalid lead byte, or not enough bytes left for the whole sequence.
  // |start| < input.size() is guaranteed here, so the subtraction is safe.
  if (length == 0 || input.size() - start < length) {
    *end = start + 1;
    return false;
  }

  // Every following byte must be a continuation byte (10xxxxxx) providing 6
  // more payload bits.
  for (size_t offset = 1; offset < length; ++offset) {
    const auto next = static_cast<uint8_t>(input[start + offset]);
    if ((next & 0b1100'0000) != 0b1000'0000) {  // NOLINT
      *end = start + 1;
      return false;
    }
    value = (value << 6) | (next & 0b0011'1111);  // NOLINT
  }

  *ucs = value;
  *end = start + length;
  return true;
}
243
244} // namespace
245
246namespace ftxui {
247int wchar_width(wchar_t ucs) {
248 return codepoint_width(uint32_t(ucs));
249}
250
251int wstring_width(const std::wstring& text) {
252 int width = 0;
253
254 for (const wchar_t& it : text) {
255 int w = wchar_width(it);
256 if (w < 0) {
257 return -1;
258 }
259 width += w;
260 }
261 return width;
262}
263
264int string_width(const std::string& input) {
265 int width = 0;
266 size_t start = 0;
267 while (start < input.size()) {
268 uint32_t codepoint = 0;
269 if (!EatCodePoint(input, start, &start, &codepoint)) {
270 continue;
271 }
272
273 if (IsControl(codepoint)) {
274 continue;
275 }
276
277 if (IsCombining(codepoint)) {
278 continue;
279 }
280
281 if (IsFullWidth(codepoint)) {
282 width += 2;
283 continue;
284 }
285
286 width += 1;
287 }
288 return width;
289}
290
291std::vector<std::string> Utf8ToGlyphs(const std::string& input) {
292 std::vector<std::string> out;
293 std::string current;
294 out.reserve(input.size());
295 size_t start = 0;
296 size_t end = 0;
297 while (start < input.size()) {
298 uint32_t codepoint = 0;
299 if (!EatCodePoint(input, start, &end, &codepoint)) {
300 start = end;
301 continue;
302 }
303
304 std::string append = input.substr(start, end - start);
305 start = end;
306
307 // Ignore control characters.
308 if (IsControl(codepoint)) {
309 continue;
310 }
311
312 // Combining characters are put with the previous glyph they are modifying.
313 if (IsCombining(codepoint)) {
314 if (!out.empty()) {
315 out.back() += append;
316 }
317 continue;
318 }
319
320 // Fullwidth characters take two cells. The second is made of the empty
321 // string to reserve the space the first is taking.
322 if (IsFullWidth(codepoint)) {
323 out.push_back(append);
324 out.emplace_back("");
325 continue;
326 }
327
328 // Normal characters:
329 out.push_back(append);
330 }
331 return out;
332}
333
334int GlyphPosition(const std::string& input, size_t glyph_index, size_t start) {
335 if (glyph_index <= 0) {
336 return 0;
337 }
338 size_t end = 0;
339 while (start < input.size()) {
340 uint32_t codepoint = 0;
341 bool eaten = EatCodePoint(input, start, &end, &codepoint);
342
343 // Ignore invalid, control characters and combining characters.
344 if (!eaten || IsControl(codepoint) || IsCombining(codepoint)) {
345 start = end;
346 continue;
347 }
348
349 // We eat the beginning of the next glyph. If we are eating the one
350 // requested, return its start position immediately.
351 if (glyph_index == 0) {
352 return static_cast<int>(start);
353 }
354
355 // Otherwise, skip this glyph and iterate:
356 glyph_index--;
357 start = end;
358 }
359 return static_cast<int>(input.size());
360}
361
362std::vector<int> CellToGlyphIndex(const std::string& input) {
363 int x = -1;
364 std::vector<int> out;
365 out.reserve(input.size());
366 size_t start = 0;
367 size_t end = 0;
368 while (start < input.size()) {
369 uint32_t codepoint = 0;
370 bool eaten = EatCodePoint(input, start, &end, &codepoint);
371 start = end;
372
373 // Ignore invalid / control characters.
374 if (!eaten || IsControl(codepoint)) {
375 continue;
376 }
377
378 // Combining characters are put with the previous glyph they are modifying.
379 if (IsCombining(codepoint)) {
380 if (x == -1) {
381 ++x;
382 out.push_back(x);
383 }
384 continue;
385 }
386
387 // Fullwidth characters take two cells. The second is made of the empty
388 // string to reserve the space the first is taking.
389 if (IsFullWidth(codepoint)) {
390 ++x;
391 out.push_back(x);
392 out.push_back(x);
393 continue;
394 }
395
396 // Normal characters:
397 ++x;
398 out.push_back(x);
399 }
400 return out;
401}
402
403int GlyphCount(const std::string& input) {
404 int size = 0;
405 size_t start = 0;
406 size_t end = 0;
407 while (start < input.size()) {
408 uint32_t codepoint = 0;
409 bool eaten = EatCodePoint(input, start, &end, &codepoint);
410 start = end;
411
412 // Ignore invalid characters:
413 if (!eaten || IsControl(codepoint)) {
414 continue;
415 }
416
417 // Ignore combining characters, except when they don't have a preceding to
418 // combine with.
419 if (IsCombining(codepoint)) {
420 if (size == 0) {
421 size++;
422 }
423 continue;
424 }
425
426 size++;
427 }
428 return size;
429}
430
431#ifdef _MSC_VER
432#pragma warning(push)
433#pragma warning(disable : 4996) // codecvt_utf8_utf16 is deprecated
434#endif
435
/// Convert a std::wstring into a UTF8 std::string.
/// The conversion is delegated to std::wstring_convert with the
/// std::codecvt_utf8_utf16<wchar_t> facet.
std::string to_string(const std::wstring& s) {
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8;
  return utf8.to_bytes(s);
}
441
/// Convert a UTF8 std::string into a std::wstring.
/// The conversion is delegated to std::wstring_convert with the
/// std::codecvt_utf8_utf16<wchar_t> facet.
std::wstring to_wstring(const std::string& s) {
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8;
  return utf8.from_bytes(s);
}
447
448#ifdef _MSC_VER
449#pragma warning(pop)
450#endif
451
452} // namespace ftxui
453
454// Copyright 2020 Arthur Sonzogni. All rights reserved.
455// Use of this source code is governed by the MIT license that can be found in
456// the LICENSE file.
int wchar_width(wchar_t)
Definition string.cpp:247
std::vector< std::string > Utf8ToGlyphs(const std::string &input)
Definition string.cpp:291
int string_width(const std::string &)
Definition string.cpp:264
std::wstring to_wstring(const std::string &s)
Convert a UTF8 std::string into a std::wstring.
Definition string.cpp:443
std::string to_string(const std::wstring &s)
Convert a std::wstring into a UTF8 std::string.
Definition string.cpp:437
Element text(std::wstring text)
Display a piece of unicode text.
Definition text.cpp:111
int GlyphPosition(const std::string &input, size_t glyph_index, size_t start=0)
Definition string.cpp:334
std::vector< int > CellToGlyphIndex(const std::string &input)
Definition string.cpp:362
int GlyphCount(const std::string &input)
Definition string.cpp:403
Decorator size(Direction, Constraint, int value)
Apply a constraint on the size of an element.
Definition size.cpp:85
int wstring_width(const std::wstring &)
Definition string.cpp:251