Files
gcc/libstdc++-v3/include/bits/unicode.h
Jonathan Wakely ac913d5d51 libstdc++: Avoid -Wmaybe-uninitialized warnings in text_encoding.cc
These variables are only read from if we haven't reached the end of
either range, in which case they're guaranteed to be initialized to the
next alphanumeric character. But we can just initialize them to make the
compiler happy.

libstdc++-v3/ChangeLog:

	* include/bits/unicode.h (__charset_alias_match): Initialize
	__var_a and __var_b.
2024-01-18 12:43:32 +00:00

1126 lines
30 KiB
C++

// Unicode utilities -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file include/bits/unicode.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{format}
*/
#ifndef _GLIBCXX_UNICODE_H
#define _GLIBCXX_UNICODE_H 1
#if __cplusplus >= 202002L
#include <array>
#include <bit> // bit_width
#include <charconv> // __detail::__from_chars_alnum_to_val_table
#include <cstdint>
#include <bits/stl_algo.h>
#include <bits/stl_iterator.h>
#include <bits/ranges_base.h>
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace __unicode
{
// True if __c is a Unicode scalar value, i.e. a code point in the range
// [0, 0x10FFFF] that is not a surrogate (0xD800 to 0xDFFF).
constexpr bool
__is_scalar_value(char32_t __c)
{
  const bool __below_surrogates = __c < 0xD800;
  const bool __above_surrogates = __c > 0xDFFF && __c <= 0x10FFFF;
  return __below_surrogates || __above_surrogates;
}
// A code point that can be encoded in a single code unit of type _CharT.
//
// For code unit types whose maximum value fits in 8 bits only the ASCII
// range [0, 0x7F] qualifies.  For wider types any scalar value that does
// not exceed the maximum value of _CharT qualifies.
//
// Both bounds are inclusive: U+007F is a single UTF-8 code unit, and
// U+FFFF is a single UTF-16 code unit (treating it as needing two units
// would make the caller emit an invalid surrogate pair).
template<typename _CharT>
  constexpr bool
  __is_single_code_unit(char32_t __c)
  {
    if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
      return __c <= 0x7F; // ASCII character
    else
      return __c <= __gnu_cxx::__int_traits<_CharT>::__max
	       && __is_scalar_value(__c);
  }
// Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
// Default error handler: a callable that yields the code point U+FFFD,
// which is substituted for ill-formed input.
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement = char32_t{0xFFFD};
    return __replacement;
  }
};
// Sentinel type that compares equal to an iterator when the element the
// iterator refers to equals a value-initialized element.
struct _Null_sentinel_t
{
  template<input_iterator _Iter>
    requires default_initializable<iter_value_t<_Iter>>
      && equality_comparable_with<iter_reference_t<_Iter>, iter_value_t<_Iter>>
    friend constexpr auto
    operator==(_Iter __pos, _Null_sentinel_t)
    {
      using _Val = iter_value_t<_Iter>;
      return *__pos == _Val{};
    }
};
// Iterator adaptor that transcodes a sequence of UTF code units.
//
//   _FromFmt       Code unit type of the input.  Its size (1, 2 or 4 bytes)
//                  selects UTF-8, UTF-16 or UTF-32 decoding (see _M_read).
//   _ToFmt         Code unit type of the output.  Its size selects the
//                  encoding written to the internal buffer (see _M_update).
//   _Iter, _Sent   Iterator and sentinel delimiting the input code units.
//   _ErrorHandler  Callable returning the code point that is substituted
//                  for ill-formed input.
//
// Each read decodes one code point from the input and re-encodes it into
// _M_buf; operator* yields the buffered code units one at a time.
template<typename _FromFmt, typename _ToFmt,
         input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
         typename _ErrorHandler = _Repl>
  requires convertible_to<iter_value_t<_Iter>, _FromFmt>
  class _Utf_iterator
  {
    // For iterators that are not forward iterators the error handler
    // must be non-throwing.
    static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));

  public:
    using value_type = _ToFmt;
    using difference_type = iter_difference_t<_Iter>;
    using reference = value_type;
    using iterator_concept
      = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
                                        bidirectional_iterator_tag>;

    constexpr _Utf_iterator() = default;

    // Construct from the start of the input, the current position and the
    // end of the input.  Decodes the code point at __it unless __it is
    // already at the end.
    constexpr
    _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
    requires bidirectional_iterator<_Iter>
    : _M_first_and_curr{__first, __it}, _M_last(__last)
    {
      if (_M_curr() != _M_last)
        _M_read();
      else
        _M_buf = {};
    }

    // As above, for iterators that are not bidirectional (the start of the
    // input is not stored).
    constexpr
    _Utf_iterator(_Iter __it, _Sent __last)
    requires (!bidirectional_iterator<_Iter>)
    : _M_first_and_curr{__it}, _M_last(__last)
    {
      if (_M_curr() != _M_last)
        _M_read();
      else
        _M_buf = {};
    }

    // Converting constructor from a _Utf_iterator over convertible
    // iterator and sentinel types.  All state is copied, including
    // _M_to_increment: without it a converted forward iterator would
    // advance by zero code units on its next increment and decode the
    // same code point again.
    template<class _Iter2, class _Sent2>
      requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
      constexpr
      _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
                                        _ErrorHandler>& __other)
      : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
        _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
        _M_to_increment(__other._M_to_increment),
        _M_last(__other._M_last)
      { }

    // The start of the underlying input.
    [[nodiscard]]
    constexpr _Iter
    begin() const requires bidirectional_iterator<_Iter>
    { return _M_first(); }

    // The end of the underlying input.
    [[nodiscard]]
    constexpr _Sent
    end() const { return _M_last; }

    // The current position in the underlying input.
    [[nodiscard]]
    constexpr _Iter
    base() const requires forward_iterator<_Iter>
    { return _M_curr(); }

    // The current output code unit.
    [[nodiscard]]
    constexpr value_type
    operator*() const { return _M_buf[_M_buf_index]; }

    // Advance to the next output code unit.  When the last buffered code
    // unit has been consumed and input remains, decode the next code point.
    constexpr _Utf_iterator&
    operator++()
    {
      if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
        {
          // For forward iterators _M_curr() still refers to the start of
          // the code point that was decoded, so skip over it now.
          if constexpr (forward_iterator<_Iter>)
            std::advance(_M_curr(), _M_to_increment);
          if (_M_curr() == _M_last)
            _M_buf_index = 0;
          else
            _M_read();
        }
      else if (_M_buf_index + 1 < _M_buf_last)
        ++_M_buf_index;
      return *this;
    }

    constexpr _Utf_iterator
    operator++(int)
    {
      auto __tmp = *this;
      ++*this;
      return __tmp;
    }

    // Step back to the previous output code unit.
    // NOTE: relies on _M_read_reverse, which is only declared here.
    constexpr _Utf_iterator&
    operator--() requires bidirectional_iterator<_Iter>
    {
      if (!_M_buf_index && _M_curr() != _M_first())
        _M_read_reverse();
      else if (_M_buf_index)
        --_M_buf_index;
      return *this;
    }

    // Constrained like the prefix form, which it calls.
    constexpr _Utf_iterator
    operator--(int) requires bidirectional_iterator<_Iter>
    {
      auto __tmp = *this;
      --*this;
      return __tmp;
    }

    [[nodiscard]]
    friend constexpr bool
    operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
    requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
    {
      if constexpr (forward_iterator<_Iter>)
        return __lhs._M_curr() == __rhs._M_curr()
                 && __lhs._M_buf_index == __rhs._M_buf_index;
      else if (__lhs._M_curr() != __rhs._M_curr())
        return false;
      else if (__lhs._M_buf_index == __rhs._M_buf_index
                 && __lhs._M_buf_last == __rhs._M_buf_last)
        return true;
      else
        return __lhs._M_buf_index == __lhs._M_buf_last
                 && __rhs._M_buf_index == __rhs._M_buf_last;
    }

    [[nodiscard]]
    friend constexpr bool
    operator==(_Utf_iterator __lhs, _Sent __rhs)
    {
      if constexpr (forward_iterator<_Iter>)
        return __lhs._M_curr() == __rhs;
      else
        return __lhs._M_curr() == __rhs
                 && __lhs._M_buf_index == __lhs._M_buf_last;
    }

  private:
    // Decode one code point, dispatching on the size of the input code
    // unit type.
    constexpr void
    _M_read()
    {
      if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
        _M_read_utf8();
      else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
        _M_read_utf16();
      else
        {
          static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
          _M_read_utf32();
        }
    }

    // Declared only; no definition is present in this part of the file.
    constexpr void
    _M_read_reverse(); // TODO

    // Scope guard used by the _M_read_* functions.  The primary template
    // does nothing.
    template<typename>
      struct _Guard
      {
        // constexpr so that the _M_read_* functions stay usable in
        // constant expressions for every iterator category.
        constexpr _Guard(void*, _Iter&) { }
      };

    // For forward iterators, restore the current position on scope exit,
    // so that _M_curr() keeps referring to the start of the code point.
    template<typename _It> requires forward_iterator<_It>
      struct _Guard<_It>
      {
        constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
        _Utf_iterator* _M_this;
        _It _M_orig;
      };

    // Decode one code point from UTF-8 input.  __to_incr counts the code
    // units belonging to the decoded (or rejected) sequence.
    constexpr void
    _M_read_utf8()
    {
      _Guard<_Iter> __g{this, _M_curr()};
      char32_t __c{};
      uint8_t __u = *_M_curr()++;
      // Valid range of a continuation byte.
      const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
      uint8_t __to_incr = 1;
      if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
        __c = __u;
      else if (__u < 0xC2) [[unlikely]]
        __c = _S_error();
      else if (_M_curr() == _M_last) [[unlikely]]
        __c = _S_error();
      else if (__u <= 0xDF) // 0xC2 to 0xDF
        {
          __c = __u & 0x1F;
          __u = *_M_curr();

          if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
            __c = _S_error();
          else
            {
              __c = (__c << 6) | (__u & 0x3F);
              ++_M_curr();
              ++__to_incr;
            }
        }
      else if (__u <= 0xEF) // 0xE0 to 0xEF
        {
          // The second byte has a narrower valid range after 0xE0 and 0xED.
          const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
          const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;

          __c = __u & 0x0F;
          __u = *_M_curr();

          if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
            __c = _S_error();
          else if (++_M_curr() == _M_last) [[unlikely]]
            __c = _S_error();
          else
            {
              ++__to_incr;
              __c = (__c << 6) | (__u & 0x3F);
              __u = *_M_curr();

              if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                __c = _S_error();
              else
                {
                  __c = (__c << 6) | (__u & 0x3F);
                  ++_M_curr();
                  ++__to_incr;
                }
            }
        }
      else if (__u <= 0xF4) // 0xF0 to 0xF4
        {
          // The second byte has a narrower valid range after 0xF0 and 0xF4.
          const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
          const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;

          __c = __u & 0x07;
          __u = *_M_curr();

          if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
            __c = _S_error();
          else if (++_M_curr() == _M_last) [[unlikely]]
            __c = _S_error();
          else
            {
              ++__to_incr;
              __c = (__c << 6) | (__u & 0x3F);
              __u = *_M_curr();

              if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                __c = _S_error();
              else if (++_M_curr() == _M_last) [[unlikely]]
                __c = _S_error();
              else
                {
                  ++__to_incr;
                  __c = (__c << 6) | (__u & 0x3F);
                  __u = *_M_curr();

                  if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
                    __c = _S_error();
                  else
                    {
                      __c = (__c << 6) | (__u & 0x3F);
                      ++_M_curr();
                      ++__to_incr;
                    }
                }
            }
        }
      else [[unlikely]]
        __c = _S_error();

      _M_update(__c, __to_incr);
    }

    // Decode one code point from UTF-16 input.
    constexpr void
    _M_read_utf16()
    {
      _Guard<_Iter> __g{this, _M_curr()};
      char32_t __c{};
      uint16_t __u = *_M_curr()++;
      uint8_t __to_incr = 1;

      if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
        __c = __u;
      else if (__u < 0xDC00 && _M_curr() != _M_last)
        {
          // __u is in [0xD800, 0xDBFF]; it must be followed by a code
          // unit in [0xDC00, 0xDFFF].
          uint16_t __u2 = *_M_curr();
          if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
            __c = _S_error();
          else
            {
              ++_M_curr();
              __to_incr = 2;
              uint32_t __x = (__u & 0x3F) << 10 | __u2 & 0x3FF;
              uint32_t __w = (__u >> 6) & 0x1F;
              __c = (__w + 1) << 16 | __x;
            }
        }
      else
        __c = _S_error();

      _M_update(__c, __to_incr);
    }

    // Decode one code point from UTF-32 input.
    constexpr void
    _M_read_utf32()
    {
      _Guard<_Iter> __g{this, _M_curr()};
      char32_t __c = *_M_curr()++;
      if (!__is_scalar_value(__c)) [[unlikely]]
        __c = _S_error();
      _M_update(__c, 1);
    }

    // Encode the code point __c as one or more code units in _M_buf.
    // __to_incr is the number of input code units to skip on the next
    // increment of a forward iterator.
    constexpr void
    _M_update(char32_t __c, uint8_t __to_incr)
    {
      _M_to_increment = __to_incr;
      _M_buf_index = 0;
      if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
        {
          _M_buf[0] = __c;
          _M_buf_last = 1;
        }
      else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
        {
          if (__is_single_code_unit<_ToFmt>(__c))
            {
              _M_buf[0] = __c;
              _M_buf[1] = 0;
              _M_buf_last = 1;
            }
          else
            {
              // From http://www.unicode.org/faq/utf_bom.html#utf16-4
              const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
              char16_t __lead = __lead_offset + (__c >> 10);
              char16_t __trail = 0xDC00 + (__c & 0x3FF);
              _M_buf[0] = __lead;
              _M_buf[1] = __trail;
              _M_buf_last = 2;
            }
        }
      else
        {
          static_assert(sizeof(_ToFmt) == 1);
          // The number of significant bits selects the UTF-8 length.
          int __bits = std::bit_width((uint32_t)__c);
          if (__bits <= 7) [[likely]]
            {
              _M_buf[0] = __c;
              _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
              _M_buf_last = 1;
            }
          else if (__bits <= 11)
            {
              _M_buf[0] = 0xC0 | (__c >> 6);
              _M_buf[1] = 0x80 | (__c & 0x3F);
              _M_buf[2] = _M_buf[3] = 0;
              _M_buf_last = 2;
            }
          else if (__bits <= 16)
            {
              _M_buf[0] = 0xE0 | (__c >> 12);
              _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
              _M_buf[2] = 0x80 | (__c & 0x3F);
              _M_buf[3] = 0;
              _M_buf_last = 3;
            }
          else
            {
              _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
              _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
              _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
              _M_buf[3] = 0x80 | (__c & 0x3F);
              _M_buf_last = 4;
            }
        }
    }

    // Obtain the substitute code point from the error handler.  The
    // handler must return a scalar value.  Uses no object state, so it is
    // a static member as its _S_ prefix indicates.
    static constexpr char32_t
    _S_error()
    {
      char32_t __c = _ErrorHandler()();
      __glibcxx_assert(__is_scalar_value(__c));
      return __c;
    }

    constexpr _Iter
    _M_first() const requires bidirectional_iterator<_Iter>
    { return _M_first_and_curr._M_first; }

    constexpr _Iter&
    _M_curr() { return _M_first_and_curr._M_curr; }

    constexpr _Iter
    _M_curr() const { return _M_first_and_curr._M_curr; }

    // Output code units of the most recently decoded code point.
    array<value_type, 4 / sizeof(_ToFmt)> _M_buf;

    // Holds the current input position only.
    template<typename _It>
      struct _First_and_curr
      {
        _First_and_curr() = default;

        constexpr
        _First_and_curr(_It __curr) : _M_curr(__curr) { }

        template<convertible_to<_It> _It2>
          constexpr
          _First_and_curr(const _First_and_curr<_It2>& __other)
          : _M_curr(__other._M_curr) { }

        _It _M_curr;
      };

    // For bidirectional iterators the start of the input is stored too.
    template<typename _It> requires bidirectional_iterator<_It>
      struct _First_and_curr<_It>
      {
        _First_and_curr() = default;

        constexpr
        _First_and_curr(_It __first, _It __curr)
        : _M_first(__first), _M_curr(__curr) { }

        template<convertible_to<_It> _It2>
          constexpr
          _First_and_curr(const _First_and_curr<_It2>& __other)
          : _M_first(__other._M_first), _M_curr(__other._M_curr) { }

        _It _M_first;
        _It _M_curr;
      };

    _First_and_curr<_Iter> _M_first_and_curr;

    uint8_t _M_buf_index = 0;    // Index in _M_buf of the current code unit.
    uint8_t _M_buf_last = 0;     // Number of code units stored in _M_buf.
    uint8_t _M_to_increment = 0; // Input code units to skip on increment.

    [[no_unique_address]] _Sent _M_last;

    // Grants the converting constructor access to other specializations.
    template<typename _FromFmt2, typename _ToFmt2,
             input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
             typename _ErrHandler>
      requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
      friend class _Utf_iterator;
  };
// A view over _Range whose iterators are _Utf_iterators producing code
// units of type _ToFormat.
template<typename _ToFormat, ranges::input_range _Range>
  class _Utf_view
  : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
  {
    using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
                                    _ToFormat, ranges::iterator_t<_Range>,
                                    ranges::sentinel_t<_Range>>;

    // Build the begin iterator.  Bidirectional iterators use the
    // three-argument constructor (start, current position, end).
    template<typename _It, typename _Se>
      constexpr auto
      _M_begin(_It __b, _Se __e)
      {
        if constexpr (!bidirectional_iterator<_It>)
          return _Iterator(__b, __e);
        else
          return _Iterator(__b, __b, __e);
      }

    // Build the end of the view: the sentinel itself when it has a
    // different type from the iterator, otherwise a _Utf_iterator
    // positioned at the end.
    template<typename _It, typename _Se>
      constexpr auto
      _M_end(_It __b, _Se __e)
      {
        if constexpr (is_same_v<_It, _Se>)
          {
            if constexpr (bidirectional_iterator<_It>)
              return _Iterator(__b, __e, __e);
            else
              return _Iterator(__e, __e);
          }
        else
          return __e;
      }

    _Range _M_base;

  public:
    constexpr explicit
    _Utf_view(_Range&& __r)
    : _M_base(std::forward<_Range>(__r))
    { }

    constexpr auto
    begin()
    { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }

    constexpr auto
    end()
    { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }

    constexpr bool
    empty() const
    { return ranges::empty(_M_base); }
  };
// Convenience aliases for _Utf_view with a fixed output code unit type.

// Produces char8_t code units.
template<typename _Rng>
  using _Utf8_view = _Utf_view<char8_t, _Rng>;

// Produces char16_t code units.
template<typename _Rng>
  using _Utf16_view = _Utf_view<char16_t, _Rng>;

// Produces char32_t code units.
template<typename _Rng>
  using _Utf32_view = _Utf_view<char32_t, _Rng>;
inline namespace __v15_1_0
{
#define _GLIBCXX_GET_UNICODE_DATA 150100
#include "unicode-data.h"
#ifdef _GLIBCXX_GET_UNICODE_DATA
# error "Invalid unicode data"
#endif
// The field width of a code point: 1 or 2.
// __width_edges is searched with upper_bound; the parity of the resulting
// position selects the width.
constexpr int
__field_width(char32_t __c) noexcept
{
  if (__c >= __width_edges[0]) [[unlikely]]
    {
      auto* __pos = std::upper_bound(__width_edges,
                                     std::end(__width_edges), __c);
      const auto __idx = __pos - __width_edges;
      return (__idx % 2) + 1;
    }
  // Everything below the first edge has width 1.
  return 1;
}
// Look up the Grapheme_Cluster_Break property of a code point.
// Each entry of __gcb_edges keeps a property value in its low
// __gcb_shift_bits bits; the search key has all of those bits set.
// @pre c <= 0x10FFFF
constexpr _Gcb_property
__grapheme_cluster_break_property(char32_t __c) noexcept
{
  constexpr uint32_t __prop_mask = (1 << __gcb_shift_bits) - 1;
  const auto __key = (__c << __gcb_shift_bits) | __prop_mask;
  auto* __last = std::end(__gcb_edges);
  auto* __pos = std::lower_bound(__gcb_edges, __last, __key);
  // The entry just before the found position holds the property.
  const auto __entry = *(__pos - 1);
  return static_cast<_Gcb_property>(__entry & __prop_mask);
}
// True if __c is one of the code points listed in __incb_linkers.
constexpr bool
__is_incb_linker(char32_t __c) noexcept
{
  // Array is small enough that linear search is faster than binary search.
  for (const auto& __linker : __incb_linkers)
    if (__linker == __c)
      return true;
  return false;
}
// Look up the Indic_Conjunct_Break property of a code point.
// Each entry of __incb_edges keeps a property value in its low two bits.
// @pre c <= 0x10FFFF
constexpr _InCB
__incb_property(char32_t __c) noexcept
{
  const auto __shifted = __c << 2;
  // Code points below the first edge have the zero-valued property.
  if (__shifted < __incb_edges[0]) [[likely]]
    return _InCB(0);

  constexpr uint32_t __prop_mask = 0x3;
  auto* __last = std::end(__incb_edges);
  auto* __pos = std::lower_bound(__incb_edges, __last,
                                 __shifted | __prop_mask);
  const auto __entry = *(__pos - 1);
  return static_cast<_InCB>(__entry & __prop_mask);
}
// True if __c has the Extended_Pictographic property.
// __xpicto_edges is searched with upper_bound; an odd resulting position
// means the property is set.
constexpr bool
__is_extended_pictographic(char32_t __c)
{
  if (__c >= __xpicto_edges[0]) [[unlikely]]
    {
      auto* __pos = std::upper_bound(__xpicto_edges,
                                     std::end(__xpicto_edges), __c);
      const auto __idx = __pos - __xpicto_edges;
      return (__idx % 2) != 0;
    }
  return false;
}
// State shared by grapheme cluster iterators: the first code point of the
// current cluster plus the bookkeeping used when deciding where the
// cluster ends.
struct _Grapheme_cluster_iterator_base
{
  char32_t _M_c; // First code point in the cluster.
  _Gcb_property _M_prop; // GCB property of _M_c.

  // Progress of matching an Extended_Pictographic / ZWJ sequence.
  enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
  _XPicto _M_xpicto_seq_state = _XPicto::_Init;

  // Length of the current run of Regional_Indicator code points.
  unsigned char _M_RI_count = 0;

  // Whether an InCB linker has been seen in the current cluster.
  bool _M_incb_linker_seen = false;

  // Begin a new cluster whose first code point is __c with property __p.
  constexpr void
  _M_reset(char32_t __c, _Gcb_property __p)
  {
    _M_incb_linker_seen = false;
    _M_RI_count = 0;
    _M_xpicto_seq_state = _XPicto::_Init;
    _M_prop = __p;
    _M_c = __c;
  }

  // Advance the pictographic sequence state for the next code point __c
  // with property __p.  Once the state is _Failed it stays _Failed.
  constexpr void
  _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
  {
    switch (_M_xpicto_seq_state)
      {
      case _XPicto::_Failed:
        return;

      case _XPicto::_Zwj:
        // This assumes that all \p{Extended_Pictographic} emoji have
        // Grapheme_Cluster_Break=Other.
        if (__p == _Gcb_property::_Gcb_Other
              && __is_extended_pictographic(__c))
          _M_xpicto_seq_state = _XPicto::_Matched;
        else
          _M_xpicto_seq_state = _XPicto::_Failed;
        return;

      case _XPicto::_Init:
      case _XPicto::_Matched:
        break;
      }

    // Here the state is _Init or _Matched.
    if (__p == _Gcb_property::_Gcb_ZWJ)
      {
        // We check _M_c here so that we do the lookup at most once,
        // and only for clusters containing at least one ZWJ.
        if (_M_xpicto_seq_state == _XPicto::_Matched
              || __is_extended_pictographic(_M_c))
          _M_xpicto_seq_state = _XPicto::_Zwj;
        else
          _M_xpicto_seq_state = _XPicto::_Failed;
      }
    else if (__p != _Gcb_property::_Gcb_Extend)
      _M_xpicto_seq_state = _XPicto::_Failed;
    // An Extend code point leaves the state unchanged.
  }

  // Count consecutive Regional_Indicator code points; any other property
  // resets the count.
  constexpr void
  _M_update_ri_count(_Gcb_property __p)
  {
    if (__p != _Gcb_property::_Gcb_Regional_Indicator)
      _M_RI_count = 0;
    else
      ++_M_RI_count;
  }

  // Record whether __c is an InCB linker.  The property argument is unused.
  constexpr void
  _M_update_incb_state(char32_t __c, _Gcb_property)
  {
    const bool __linker = __is_incb_linker(__c);
    if (__linker)
      _M_incb_linker_seen = true;
  }
};
// Split a range into extended grapheme clusters.
//
// The view stores only its begin iterator, which iterates over the input
// transcoded to UTF-32 and stops at the first code point of each cluster.
template<ranges::forward_range _View> requires ranges::view<_View>
  class _Grapheme_cluster_view
  : public ranges::view_interface<_Grapheme_cluster_view<_View>>
  {
  public:

    constexpr
    _Grapheme_cluster_view(_View __v)
    : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
    { }

    constexpr auto begin() const { return _M_begin; }
    constexpr auto end() const { return _M_begin.end(); }

  private:
    struct _Iterator : private _Grapheme_cluster_iterator_base
    {
    private:
      // Iterator over the underlying code points.
      using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;

    public:
      // TODO: Change value_type to be subrange<_U32_iterator> instead?
      // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
      // That would be the whole cluster, not just the first code point.
      // Would need to store two iterators and find end of current cluster
      // on increment, so operator* returns value_type(_M_base, _M_next).
      using value_type = char32_t;
      using iterator_concept = forward_iterator_tag;
      using difference_type = ptrdiff_t;

      // Start at __i.  If __i is not at the end, record the first code
      // point and its Grapheme_Cluster_Break property.
      constexpr
      _Iterator(_U32_iterator __i)
      : _M_base(__i)
      {
        if (__i != __i.end())
          {
            _M_c = *__i;
            _M_prop = __grapheme_cluster_break_property(_M_c);
          }
      }

      // The first code point of the current extended grapheme cluster.
      constexpr value_type
      operator*() const
      { return _M_c; }

      constexpr auto
      operator->() const
      { return &_M_c; }

      // Move to the next extended grapheme cluster.
      constexpr _Iterator&
      operator++()
      {
        const auto __end = _M_base.end();
        if (_M_base != __end)
          {
            auto __p_prev = _M_prop;
            auto __it = _M_base;
            while (++__it != __end)
              {
                char32_t __c = *__it;
                auto __p = __grapheme_cluster_break_property(__c);
                // Update the per-cluster state first; _M_is_break reads it.
                _M_update_xpicto_seq_state(__c, __p);
                _M_update_ri_count(__p);
                _M_update_incb_state(__c, __p);
                if (_M_is_break(__p_prev, __p, __it))
                  {
                    // Found a grapheme cluster break
                    _M_reset(__c, __p);
                    break;
                  }
                __p_prev = __p;
              }
            _M_base = __it;
          }
        return *this;
      }

      // Post-increment.  The object must be incremented (++*this);
      // `++this` would try to increment the this pointer, which is
      // ill-formed.
      constexpr _Iterator
      operator++(int)
      {
        auto __tmp = *this;
        ++*this;
        return __tmp;
      }

      constexpr bool
      operator==(const _Iterator& __i) const
      { return _M_base == __i._M_base; }

      // This supports iter != iter.end()
      constexpr bool
      operator==(const ranges::sentinel_t<_View>& __i) const
      { return _M_base == __i; }

      // Iterator to the start of the current cluster.
      constexpr auto base() const { return _M_base.base(); }

      // The end of the underlying view (not the end of the current cluster!)
      constexpr auto end() const { return _M_base.end(); }

      // Field width of the first code point in the cluster.
      constexpr int
      width() const noexcept
      { return __field_width(_M_c); }

    private:
      _U32_iterator _M_base;

      // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
      // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
      // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
      // Return true if there is a break between code point with property p1
      // and code point with property p2.
      // __curr refers to the code point with property p2.
      constexpr bool
      _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
                  _U32_iterator __curr) const
      {
        using enum _Gcb_property;

        // Rules GB3 and GB4
        if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
          return true; // Break after Control or LF.

        if (__p1 == _Gcb_CR)
          return __p2 != _Gcb_LF; // Do not break between a CR and LF.

        // Rule GB5
        if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
          return true; // Break before Control, CR or LF.

        // Rule GB6
        if (__p1 == _Gcb_L)
          switch (__p2)
            {
            case _Gcb_L:
            case _Gcb_V:
            case _Gcb_LV:
            case _Gcb_LVT:
              return false; // Do not break Hangul syllable sequences.
            default:
              return true;
            }

        // Rule GB7
        if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
          switch (__p2)
            {
            case _Gcb_V:
            case _Gcb_T:
              return false; // Do not break Hangul syllable sequences.
            default:
              return true;
            }

        // Rule GB8
        if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
          return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.

        // Rule GB9
        if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
          return false; // Do not break before extending characters or ZWJ.

        // The following GB9x rules only apply to extended grapheme clusters,
        // which is what the C++ standard uses (not legacy grapheme clusters).

        // Rule GB9a
        if (__p2 == _Gcb_SpacingMark)
          return false; // Do not break before SpacingMarks,

        // Rule GB9b
        if (__p1 == _Gcb_Prepend)
          return false; // or after Prepend characters.

        // Rule GB9c (Unicode 15.1.0)
        // Do not break within certain combinations with
        // Indic_Conjunct_Break (InCB)=Linker.
        if (_M_incb_linker_seen
              && __incb_property(_M_c) == _InCB::_Consonant
              && __incb_property(*__curr) == _InCB::_Consonant)
          {
            // Match [_M_base, __curr] against regular expression
            // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
            bool __have_linker = false;
            auto __it = _M_base;
            while (++__it != __curr)
              {
                if (__is_incb_linker(*__it))
                  __have_linker = true;
                else
                  {
                    auto __incb = __incb_property(*__it);
                    if (__incb == _InCB::_Consonant)
                      __have_linker = false;
                    else if (__incb != _InCB::_Extend)
                      break;
                  }
              }
            if (__it == __curr && __have_linker)
              return false;
          }

        // Rule GB11
        // Do not break within emoji modifier sequences
        // or emoji zwj sequences.
        if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
          return false;

        // Rules GB12 and GB13
        // Do not break within emoji flag sequences. That is, do not break
        // between regional indicator (RI) symbols if there is an odd number
        // of RI characters before the break point.
        if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
          return (_M_RI_count & 1) == 0;

        // Rule GB999
        return true; // Otherwise, break everywhere.
      }
    };

    _Iterator _M_begin;
  };
} // namespace __v15_1_0
// Return the field width of a string: the sum of the widths of its
// extended grapheme clusters (each cluster contributes the width of its
// first code point).  An empty string has width 0.
template<typename _CharT>
  constexpr size_t
  __field_width(basic_string_view<_CharT> __s)
  {
    size_t __total = 0;
    if (!__s.empty()) [[likely]]
      {
        _Grapheme_cluster_view<basic_string_view<_CharT>> __clusters(__s);
        auto __cur = __clusters.begin();
        const auto __last = __clusters.end();
        // The string is non-empty, so there is at least one cluster.
        do
          __total += __cur.width();
        while (++__cur != __last);
      }
    return __total;
  }
// Truncate a string to at most `__max` field width units, and return the
// resulting field width.
// The string is only ever cut at a grapheme cluster boundary.  If even the
// first cluster is too wide, __s becomes empty and 0 is returned.
template<typename _CharT>
  constexpr size_t
  __truncate(basic_string_view<_CharT>& __s, size_t __max)
  {
    if (__s.empty()) [[unlikely]]
      return 0;

    _Grapheme_cluster_view<basic_string_view<_CharT>> __clusters(__s);
    auto __cur = __clusters.begin();
    const auto __last = __clusters.end();

    size_t __used = __cur.width();
    if (__used > __max)
      {
        __s = {};
        return 0;
      }

    for (++__cur; __cur != __last; ++__cur)
      {
        const size_t __with_next = __used + __cur.width();
        if (__with_next > __max)
          {
            // Keep everything before the cluster that does not fit.
            __s = basic_string_view<_CharT>(__s.begin(), __cur.base());
            return __used;
          }
        __used = __with_next;
      }
    return __used;
  }
// Determine at compile time whether the literal encoding for character
// type _CharT is a Unicode encoding form.
//
// Returns true for char8_t, char16_t and char32_t.  For other types the
// result comes from the compiler's charset macros: the name must be UTF-8,
// or for types other than char also UTF-16 or UTF-32.  Returns false when
// no such macro is available or the name is not recognized.
template<typename _CharT>
consteval bool
__literal_encoding_is_unicode()
{
if constexpr (is_same_v<_CharT, char8_t>)
return true;
else if constexpr (is_same_v<_CharT, char16_t>)
return true;
else if constexpr (is_same_v<_CharT, char32_t>)
return true;
// Name of the literal encoding; stays empty if no macro provides one.
const char* __enc = "";
#ifdef __GNUC_EXECUTION_CHARSET_NAME
// Skip a leading "ISO-10646/" (10 characters) if present.
auto __remove_iso10646_prefix = [](const char* __s) {
// GNU iconv allows "ISO-10646/" prefix (case-insensitive).
if (__s[0] == 'I' || __s[0] == 'i')
if (__s[1] == 'S' || __s[1] == 's')
if (__s[2] == 'O' || __s[2] == 'o')
if (string_view(__s + 3).starts_with("-10646/"))
return __s + 10;
return __s;
};
if constexpr (is_same_v<_CharT, char>)
__enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
else
__enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
# endif
// Accept "UTF" in either case for each letter, optionally followed by '-'.
if ((__enc[0] == 'U' || __enc[0] == 'u')
&& (__enc[1] == 'T' || __enc[1] == 't')
&& (__enc[2] == 'F' || __enc[2] == 'f'))
{
__enc += 3;
if (__enc[0] == '-')
++__enc;
// "8" must end the name, or be followed by exactly "//".
if (__enc[0] == '8')
return __enc[1] == '\0' || string_view(__enc + 1) == "//";
else if constexpr (!is_same_v<_CharT, char>)
{
// For types other than char, "16" and "32" are accepted too,
// ignoring a trailing "//".
string_view __s(__enc);
if (__s.ends_with("//"))
__s.remove_suffix(2);
return __s == "16" || __s == "32";
}
}
#elif defined __clang_literal_encoding__
if constexpr (is_same_v<_CharT, char>)
__enc = __clang_literal_encoding__;
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
else
__enc = __clang_wide_literal_encoding__;
# endif
// Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
string_view __s(__enc);
if (__s == "UTF-8")
return true;
else if constexpr (!is_same_v<_CharT, char>)
return __s == "UTF-16" || __s == "UTF-32";
#endif
// Unknown or non-Unicode encoding.
return false;
}
// True if the literal encoding for type char is recognized as Unicode by
// __literal_encoding_is_unicode<char>().
consteval bool
__literal_encoding_is_utf8()
{
  constexpr bool __char_is_unicode = __literal_encoding_is_unicode<char>();
  return __char_is_unicode;
}
// True if the characters '0', 'A', 'Z', 'a' and 'z' have their ASCII
// values in the literal encoding.
consteval bool
__literal_encoding_is_extended_ascii()
{
  if ('0' != 0x30)
    return false;
  if ('A' != 0x41 || 'Z' != 0x5a)
    return false;
  return 'a' == 0x61 && 'z' == 0x7a;
}
// Compare two charset names using the loose matching described at
// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
// Characters that map to 127 are ignored, and so is a '0' that does not
// directly follow a digit.  Returns true if the remaining characters of
// both names map to the same sequence of values.
constexpr bool
__charset_alias_match(string_view __a, string_view __b)
{
  // Map a character to its alphanumeric value, or to 127 if it is to be
  // skipped.  __after_digit records whether the previous significant
  // character was a digit (a value below 10).
  // NOTE(review): presumably __from_chars_alnum_to_val yields 127 for
  // non-alphanumeric characters; confirm against <charconv>.
  auto __map = [](char __ch, bool& __after_digit) -> unsigned char {
    if (__ch == '0') [[unlikely]]
      return __after_digit ? 0 : 127;
    const auto __val = __detail::__from_chars_alnum_to_val(__ch);
    __after_digit = __val < 10;
    return __val;
  };

  // Advance __pos to the next significant character (or to __last) and
  // return that character's mapped value.
  auto __next_value = [&__map](auto& __pos, auto __last,
                               bool& __after_digit) -> unsigned char {
    unsigned char __val{};
    while (__pos != __last
             && (__val = __map(*__pos, __after_digit)) == 127)
      ++__pos;
    return __val;
  };

  auto __ptr_a = __a.begin(), __end_a = __a.end();
  auto __ptr_b = __b.begin(), __end_b = __b.end();
  bool __num_a = false, __num_b = false;

  for (;;)
    {
      // Find the value of the next alphanumeric character in each string.
      const unsigned char __val_a = __next_value(__ptr_a, __end_a, __num_a);
      const unsigned char __val_b = __next_value(__ptr_b, __end_b, __num_b);

      // Stop when we reach the end of a string, or get a mismatch.
      const bool __done_a = __ptr_a == __end_a;
      const bool __done_b = __ptr_b == __end_b;
      if (__done_a || __done_b)
        return __done_a && __done_b;
      if (__val_a != __val_b)
        return false; // Found non-matching characters.

      ++__ptr_a;
      ++__ptr_b;
    }
}
} // namespace __unicode
namespace ranges
{
  // A _Utf_view is a borrowed range exactly when the range it adapts is.
  template<typename _ToFmt, typename _Rng>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Utf_view<_ToFmt, _Rng>>
      = enable_borrowed_range<_Rng>;

  // Likewise for _Grapheme_cluster_view.
  template<typename _Rng>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Rng>>
      = enable_borrowed_range<_Rng>;
} // namespace ranges
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif // C++20
#endif // _GLIBCXX_UNICODE_H