| 1 | // Copyright 2007-2010 the V8 project authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #ifndef V8_UNICODE_INL_H_ |
| 6 | #define V8_UNICODE_INL_H_ |
| 7 | |
| 8 | #include "src/unicode.h" |
| 9 | #include "src/base/logging.h" |
| 10 | #include "src/utils.h" |
| 11 | |
| 12 | namespace unibrow { |
| 13 | |
| 14 | #ifndef V8_INTL_SUPPORT |
| 15 | template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { |
| 16 | CacheEntry entry = entries_[code_point & kMask]; |
| 17 | if (entry.code_point() == code_point) return entry.value(); |
| 18 | return CalculateValue(code_point); |
| 19 | } |
| 20 | |
| 21 | template <class T, int s> bool Predicate<T, s>::CalculateValue( |
| 22 | uchar code_point) { |
| 23 | bool result = T::Is(code_point); |
| 24 | entries_[code_point & kMask] = CacheEntry(code_point, result); |
| 25 | return result; |
| 26 | } |
| 27 | |
| 28 | template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, |
| 29 | uchar* result) { |
| 30 | CacheEntry entry = entries_[c & kMask]; |
| 31 | if (entry.code_point_ == c) { |
| 32 | if (entry.offset_ == 0) { |
| 33 | return 0; |
| 34 | } else { |
| 35 | result[0] = c + entry.offset_; |
| 36 | return 1; |
| 37 | } |
| 38 | } else { |
| 39 | return CalculateValue(c, n, result); |
| 40 | } |
| 41 | } |
| 42 | |
| 43 | template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, |
| 44 | uchar* result) { |
| 45 | bool allow_caching = true; |
| 46 | int length = T::Convert(c, n, result, &allow_caching); |
| 47 | if (allow_caching) { |
| 48 | if (length == 1) { |
| 49 | entries_[c & kMask] = CacheEntry(c, result[0] - c); |
| 50 | return 1; |
| 51 | } else { |
| 52 | entries_[c & kMask] = CacheEntry(c, 0); |
| 53 | return 0; |
| 54 | } |
| 55 | } else { |
| 56 | return length; |
| 57 | } |
| 58 | } |
| 59 | #endif // !V8_INTL_SUPPORT |
| 60 | |
| 61 | // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they |
| 62 | // stream in. This **must** be followed by a call to ValueOfIncrementalFinish |
| 63 | // when the stream is complete, to ensure incomplete sequences are handled. |
| 64 | uchar Utf8::ValueOfIncremental(const byte** cursor, State* state, |
| 65 | Utf8IncrementalBuffer* buffer) { |
| 66 | DCHECK_NOT_NULL(buffer); |
| 67 | State old_state = *state; |
| 68 | byte next = **cursor; |
| 69 | *cursor += 1; |
| 70 | |
| 71 | if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { |
| 72 | DCHECK_EQ(0u, *buffer); |
| 73 | return static_cast<uchar>(next); |
| 74 | } |
| 75 | |
| 76 | // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation |
| 77 | // char in that sequence. |
| 78 | Utf8DfaDecoder::Decode(next, state, buffer); |
| 79 | |
| 80 | switch (*state) { |
| 81 | case State::kAccept: { |
| 82 | uchar t = *buffer; |
| 83 | *buffer = 0; |
| 84 | return t; |
| 85 | } |
| 86 | |
| 87 | case State::kReject: |
| 88 | *state = State::kAccept; |
| 89 | *buffer = 0; |
| 90 | |
| 91 | // If we hit a bad byte, we need to determine if we were trying to start |
| 92 | // a sequence or continue one. If we were trying to start a sequence, |
| 93 | // that means it's just an invalid lead byte and we need to continue to |
| 94 | // the next (which we already did above). If we were already in a |
| 95 | // sequence, we need to reprocess this same byte after resetting to the |
| 96 | // initial state. |
| 97 | if (old_state != State::kAccept) { |
| 98 | // We were trying to continue a sequence, so let's reprocess this byte |
| 99 | // next time. |
| 100 | *cursor -= 1; |
| 101 | } |
| 102 | return kBadChar; |
| 103 | |
| 104 | default: |
| 105 | return kIncomplete; |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
| 110 | static const int kMask = ~(1 << 6); |
| 111 | if (c <= kMaxOneByteChar) { |
| 112 | str[0] = c; |
| 113 | return 1; |
| 114 | } |
| 115 | str[0] = 0xC0 | (c >> 6); |
| 116 | str[1] = 0x80 | (c & kMask); |
| 117 | return 2; |
| 118 | } |
| 119 | |
| 120 | // Encode encodes the UTF-16 code units c and previous into the given str |
| 121 | // buffer, and combines surrogate code units into single code points. If |
| 122 | // replace_invalid is set to true, orphan surrogate code units will be replaced |
| 123 | // with kBadChar. |
| 124 | unsigned Utf8::Encode(char* str, |
| 125 | uchar c, |
| 126 | int previous, |
| 127 | bool replace_invalid) { |
| 128 | static const int kMask = ~(1 << 6); |
| 129 | if (c <= kMaxOneByteChar) { |
| 130 | str[0] = c; |
| 131 | return 1; |
| 132 | } else if (c <= kMaxTwoByteChar) { |
| 133 | str[0] = 0xC0 | (c >> 6); |
| 134 | str[1] = 0x80 | (c & kMask); |
| 135 | return 2; |
| 136 | } else if (c <= kMaxThreeByteChar) { |
| 137 | DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); |
| 138 | if (Utf16::IsSurrogatePair(previous, c)) { |
| 139 | const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
| 140 | return Encode(str - kUnmatchedSize, |
| 141 | Utf16::CombineSurrogatePair(previous, c), |
| 142 | Utf16::kNoPreviousCharacter, |
| 143 | replace_invalid) - kUnmatchedSize; |
| 144 | } else if (replace_invalid && |
| 145 | (Utf16::IsLeadSurrogate(c) || |
| 146 | Utf16::IsTrailSurrogate(c))) { |
| 147 | c = kBadChar; |
| 148 | } |
| 149 | str[0] = 0xE0 | (c >> 12); |
| 150 | str[1] = 0x80 | ((c >> 6) & kMask); |
| 151 | str[2] = 0x80 | (c & kMask); |
| 152 | return 3; |
| 153 | } else { |
| 154 | str[0] = 0xF0 | (c >> 18); |
| 155 | str[1] = 0x80 | ((c >> 12) & kMask); |
| 156 | str[2] = 0x80 | ((c >> 6) & kMask); |
| 157 | str[3] = 0x80 | (c & kMask); |
| 158 | return 4; |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | |
| 163 | uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { |
| 164 | if (length <= 0) return kBadChar; |
| 165 | byte first = bytes[0]; |
| 166 | // Characters between 0000 and 007F are encoded as a single character |
| 167 | if (V8_LIKELY(first <= kMaxOneByteChar)) { |
| 168 | *cursor += 1; |
| 169 | return first; |
| 170 | } |
| 171 | return CalculateValue(bytes, length, cursor); |
| 172 | } |
| 173 | |
| 174 | unsigned Utf8::Length(uchar c, int previous) { |
| 175 | if (c <= kMaxOneByteChar) { |
| 176 | return 1; |
| 177 | } else if (c <= kMaxTwoByteChar) { |
| 178 | return 2; |
| 179 | } else if (c <= kMaxThreeByteChar) { |
| 180 | DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); |
| 181 | if (Utf16::IsSurrogatePair(previous, c)) { |
| 182 | return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
| 183 | } |
| 184 | return 3; |
| 185 | } else { |
| 186 | return 4; |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | bool Utf8::IsValidCharacter(uchar c) { |
| 191 | return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || |
| 192 | (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && |
| 193 | c != kBadChar); |
| 194 | } |
| 195 | |
| 196 | } // namespace unibrow |
| 197 | |
| 198 | #endif // V8_UNICODE_INL_H_ |
| 199 | |