All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
unicodeUtils.h
Go to the documentation of this file.
1//
2// Copyright 2023 Pixar
3//
4// Licensed under the terms set forth in the LICENSE.txt file available at
5// https://openusd.org/license.
6//
7#ifndef PXR_BASE_TF_UNICODE_UTILS_H
8#define PXR_BASE_TF_UNICODE_UTILS_H
9
13
14#include "pxr/pxr.h"
15#include "pxr/base/tf/api.h"
17
18#include <ostream>
19#include <string>
20#include <string_view>
21
22PXR_NAMESPACE_OPEN_SCOPE
23
37public:
40 static constexpr uint32_t ReplacementValue = 0xFFFD;
41
44 static constexpr uint32_t MaximumValue = 0x10FFFF;
45
48 static constexpr std::pair<uint32_t, uint32_t>
49 SurrogateRange = {0xD800, 0xDFFF};
50
52 constexpr TfUtf8CodePoint() = default;
53
56 constexpr explicit TfUtf8CodePoint(uint32_t value) :
57 _value(((value <= MaximumValue) &&
58 ((value < SurrogateRange.first) ||
59 (value > SurrogateRange.second))) ?
60 value : ReplacementValue) {}
61
62 constexpr uint32_t AsUInt32() const { return _value; }
63
64 friend constexpr bool operator==(const TfUtf8CodePoint left,
65 const TfUtf8CodePoint right) {
66 return left._value == right._value;
67 }
68 friend constexpr bool operator!=(const TfUtf8CodePoint left,
69 const TfUtf8CodePoint right) {
70 return left._value != right._value;
71 }
72
73private:
74 uint32_t _value{ReplacementValue};
75};
76
77TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);
78
83
85constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
86{
87 return static_cast<unsigned char>(value) < 128 ?
88 TfUtf8CodePoint(static_cast<unsigned char>(value)) :
90}
91
100public:
101 using iterator_category = std::forward_iterator_tag;
103 using difference_type = std::ptrdiff_t;
104 using pointer = void;
106
109 class PastTheEndSentinel final {};
110
119 const std::string_view::const_iterator& it,
120 const std::string_view::const_iterator& end) : _it(it), _end(end) {
121 TF_DEV_AXIOM(_it <= _end);
122 }
123
131 {
132 return TfUtf8CodePoint{_GetCodePoint()};
133 }
134
136 std::string_view::const_iterator GetBase() const
137 {
138 return this->_it;
139 }
140
146 {
147 return (this->_it == rhs._it);
148 }
149
154 bool operator!= (const TfUtf8CodePointIterator& rhs) const
155 {
156 return (this->_it != rhs._it);
157 }
158
165 {
166 // The increment operator should never be called if it's past
167 // the end. The user is expected to have already checked this
168 // condition.
169 TF_DEV_AXIOM(!_IsPastTheEnd());
170 _EncodingLength increment = _GetEncodingLength();
171 // Note that in cases where the encoding is invalid, we move to the
172 // next byte. This is necessary because otherwise the iterator would
173 // never advance and the end condition of == iterator::end() would
174 // never be satisfied. This means that we increment, even if the
175 // encoding length is 0.
176 ++_it;
177 // Only continuation bytes will be consumed after the first byte.
178 // This avoids consumption of ASCII characters or other starting bytes.
179 auto isContinuation = [](const char c) {
180 const auto uc = static_cast<unsigned char>(c);
181 return (uc >= static_cast<unsigned char>('\x80')) &&
182 (uc < static_cast<unsigned char>('\xc0'));
183 };
184 while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
185 ++_it;
186 --increment;
187 }
188 return *this;
189 }
190
197 {
198 auto temp = *this;
199 ++(*this);
200 return temp;
201 }
202
205 friend bool operator==(const TfUtf8CodePointIterator& lhs,
207 {
208 return lhs._IsPastTheEnd();
209 }
210
211 friend bool operator==(PastTheEndSentinel lhs,
212 const TfUtf8CodePointIterator& rhs)
213 {
214 return rhs == lhs;
215 }
216
217 friend bool operator!=(const TfUtf8CodePointIterator& lhs,
218 PastTheEndSentinel rhs)
219 {
220 return !(lhs == rhs);
221 }
222 friend bool operator!=(PastTheEndSentinel lhs,
223 const TfUtf8CodePointIterator& rhs)
224 {
225 return !(lhs == rhs);
226 }
227
228private:
229 using _EncodingLength = unsigned char;
230
231 // Retrieves the variable encoding length of the UTF-8 character
232 // currently pointed to by the iterator. This can be 1, 2, 3, or 4
233 // depending on the encoding of the UTF-8 character. If the encoding
234 // cannot be determined, this method will return 0.
235 _EncodingLength _GetEncodingLength() const
236 {
237 // already at the end, no valid character sequence
238 if (_IsPastTheEnd())
239 {
240 return 0;
241 }
242 // determine what encoding length the character is
243 // 1-byte characters have a leading 0 sequence
244 // 2-byte characters have a leading 110 sequence
245 // 3-byte characters have a leading 1110 sequence
246 // 4-byte characters have a leading 11110 sequence
247 unsigned char x = static_cast<unsigned char>(*_it);
248 if (x < 0x80)
249 {
250 return 1;
251 }
252 else if ((x >= 0xc0) && (x < 0xe0))
253 {
254 return 2;
255 }
256 else if ((x >= 0xe0) && (x < 0xf0))
257 {
258 return 3;
259 }
260 else if ((x >= 0xf0) && (x < 0xf8))
261 {
262 return 4;
263 }
264 else
265 {
266 // can't determine encoding, this is an error
267 return 0;
268 }
269 }
270
271 // Decodes the UTF-8 sequence at the iterator's current position and
272 // returns the resulting Unicode code point value as a uint32_t.
273 // NOTE(review): this comment previously described a bool return and a
274 // codePoint out-parameter, neither of which this signature has; the
275 // value returned for an invalid encoding is set by the implementation.
276 TF_API uint32_t _GetCodePoint() const;
277
278 // Returns true if the iterator at or past the end and can no longer be
279 // dereferenced.
280 bool _IsPastTheEnd() const
281 {
282 return _it >= _end;
283 }
284
285 std::string_view::const_iterator _it;
286 std::string_view::const_iterator _end;
287};
288
322public:
324
325 TfUtf8CodePointView() = default;
326 explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}
327
328 inline const_iterator begin() const
329 {
330 return const_iterator{std::cbegin(_view), std::cend(_view)};
331 }
332
336 {
338 }
339
340 inline const_iterator cbegin() const
341 {
342 return begin();
343 }
344
348 {
349 return end();
350 }
351
353 bool empty() const
354 {
355 return _view.empty();
356 }
357
369 {
370 return const_iterator(std::cend(_view), std::cend(_view));
371 }
372
373private:
374 std::string_view _view;
375};
376
386TF_API
387bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
388
393inline bool TfIsUtf8CodePointXidStart(const TfUtf8CodePoint codePoint)
394{
395 return TfIsUtf8CodePointXidStart(codePoint.AsUInt32());
396}
397
407TF_API
408bool TfIsUtf8CodePointXidContinue(uint32_t codePoint);
409
415{
416 return TfIsUtf8CodePointXidContinue(codePoint.AsUInt32());
417}
418
419PXR_NAMESPACE_CLOSE_SCOPE
420
421#endif // PXR_BASE_TF_UNICODE_UTILS_H
Low-level utilities for informing users of various internal and external diagnostic conditions.
Wrapper for a 32-bit code point value that can be encoded as UTF-8.
Definition: unicodeUtils.h:36
static constexpr std::pair< uint32_t, uint32_t > SurrogateRange
Values in this range (inclusive) cannot be constructed and will be replaced by the replacement code p...
Definition: unicodeUtils.h:49
constexpr TfUtf8CodePoint(uint32_t value)
Construct a UTF-8 valued code point, constrained by the maximum value and surrogate range.
Definition: unicodeUtils.h:56
static constexpr uint32_t MaximumValue
Values higher than this will be replaced with the replacement code point.
Definition: unicodeUtils.h:44
constexpr TfUtf8CodePoint()=default
Construct a code point initialized to the replacement value.
static constexpr uint32_t ReplacementValue
Code points that cannot be decoded or are outside of the valid range will be replaced with this value...
Definition: unicodeUtils.h:40
Defines an iterator over a UTF-8 encoded string that extracts unicode code point values.
Definition: unicodeUtils.h:99
bool operator!=(const TfUtf8CodePointIterator &rhs) const
Determines if two iterators are unequal.
Definition: unicodeUtils.h:154
friend bool operator==(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel)
Checks if the lhs iterator is at or past the end for the underlying string_view
Definition: unicodeUtils.h:205
value_type operator*() const
Retrieves the current UTF-8 character in the sequence as its Unicode code point value.
Definition: unicodeUtils.h:130
TfUtf8CodePointIterator & operator++()
Advances the iterator logically one UTF-8 character sequence in the string.
Definition: unicodeUtils.h:164
TfUtf8CodePointIterator(const std::string_view::const_iterator &it, const std::string_view::const_iterator &end)
Constructs an iterator that can read UTF-8 character sequences from the given starting string_view it...
Definition: unicodeUtils.h:118
std::string_view::const_iterator GetBase() const
Retrieves the wrapped string iterator.
Definition: unicodeUtils.h:136
Model iteration ending when the underlying iterator's end condition has been met.
Definition: unicodeUtils.h:109
Wrapper for a UTF-8 encoded std::string_view that can be iterated over as code points instead of byte...
Definition: unicodeUtils.h:321
bool empty() const
Returns true if the underlying view is empty.
Definition: unicodeUtils.h:353
TfUtf8CodePointIterator::PastTheEndSentinel end() const
The sentinel will compare as equal to any iterator at the end of the underlying string_view
Definition: unicodeUtils.h:335
TfUtf8CodePointIterator::PastTheEndSentinel cend() const
The sentinel will compare as equal to any iterator at the end of the underlying string_view
Definition: unicodeUtils.h:347
const_iterator EndAsIterator() const
Returns an iterator of the same type as begin that identifies the end of the string.
Definition: unicodeUtils.h:368
GF_API std::ostream & operator<<(std::ostream &, const GfBBox3d &)
Output a GfBBox3d using the format [(range) matrix zeroArea].
#define TF_DEV_AXIOM(cond)
The same as TF_AXIOM, but compiled only in dev builds.
Definition: diagnostic.h:205
constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint
The replacement code point can be used to signal that a code point could not be decoded and needed to...
Definition: unicodeUtils.h:81
TF_API bool TfIsUtf8CodePointXidContinue(uint32_t codePoint)
Determines whether the given Unicode codePoint is in the XID_Continue character class.
constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
Constructs a TfUtf8CodePoint from an ASCII character (0-127).
Definition: unicodeUtils.h:85
TF_API bool TfIsUtf8CodePointXidStart(uint32_t codePoint)
Determines whether the given Unicode codePoint is in the XID_Start character class.