// decode.js (7.7 KB)
  1. "use strict";
  // Module interop helper: reuses an already-installed `this.__importDefault`
  // when one exists; otherwise wraps a required module so that it can always
  // be read through `.default`.
  var __importDefault = (this && this.__importDefault) || function (mod) {
      // Modules flagged with `__esModule` already carry their own `default`.
      if (mod && mod.__esModule) {
          return mod;
      }
      // Anything else becomes the `default` of a fresh wrapper object.
      return { "default": mod };
  };
  5. Object.defineProperty(exports, "__esModule", { value: true });
  6. exports.decodeXML = exports.decodeHTMLStrict = exports.decodeHTML = exports.determineBranch = exports.BinTrieFlags = exports.fromCodePoint = exports.replaceCodePoint = exports.decodeCodePoint = exports.xmlDecodeTree = exports.htmlDecodeTree = void 0;
  7. var decode_data_html_js_1 = __importDefault(require("./generated/decode-data-html.js"));
  8. exports.htmlDecodeTree = decode_data_html_js_1.default;
  9. var decode_data_xml_js_1 = __importDefault(require("./generated/decode-data-xml.js"));
  10. exports.xmlDecodeTree = decode_data_xml_js_1.default;
  11. var decode_codepoint_js_1 = __importDefault(require("./decode_codepoint.js"));
  12. exports.decodeCodePoint = decode_codepoint_js_1.default;
  13. var decode_codepoint_js_2 = require("./decode_codepoint.js");
  14. Object.defineProperty(exports, "replaceCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.replaceCodePoint; } });
  15. Object.defineProperty(exports, "fromCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.fromCodePoint; } });
  // Character codes the decoder compares against. The table is mapped both
  // ways (name -> code and code -> name), in the order listed.
  var CharCodes = {};
  [
      ["NUM", 35],
      ["SEMI", 59],
      ["ZERO", 48],
      ["NINE", 57],
      ["LOWER_A", 97],
      ["LOWER_F", 102],
      ["LOWER_X", 120],
      /** Bit that needs to be set to convert an upper case ASCII character to lower case */
      ["To_LOWER_BIT", 32]
  ].forEach(function (entry) {
      var label = entry[0];
      var code = entry[1];
      CharCodes[label] = code;
      CharCodes[code] = label;
  });
  // Bit masks applied to a trie node word. The decoder in this file shifts
  // VALUE_LENGTH down by 14 and BRANCH_LENGTH down by 7; JUMP_TABLE is the
  // low seven bits. The object is shared with `exports.BinTrieFlags` and is
  // mapped both ways (name -> mask and mask -> name).
  var BinTrieFlags = exports.BinTrieFlags || (exports.BinTrieFlags = {});
  BinTrieFlags[(BinTrieFlags["VALUE_LENGTH"] = 0xc000)] = "VALUE_LENGTH";
  BinTrieFlags[(BinTrieFlags["BRANCH_LENGTH"] = 0x3f80)] = "BRANCH_LENGTH";
  BinTrieFlags[(BinTrieFlags["JUMP_TABLE"] = 0x7f)] = "JUMP_TABLE";
  /**
   * Creates a decoder bound to one entity trie.
   *
   * The returned function scans `str` for "&", decodes numeric references
   * ("&#...;" / "&#x...;") through `decode_codepoint_js_1.default`, and decodes
   * named references by walking `decodeTree` with `determineBranch`. Text that
   * does not form a recognised reference is copied through unchanged.
   *
   * @param decodeTree Trie of entity data, indexed as an array of numbers.
   * @returns A function `(str, strict)`; when `strict` is truthy only
   *     references terminated by ";" are decoded.
   */
  function getDecoder(decodeTree) {
      // Is `code` a digit for the given radix? Hex letters are matched
      // case-insensitively by forcing the lower-case bit.
      function isDigit(code, radix) {
          if (code >= CharCodes.ZERO && code <= CharCodes.NINE) {
              return true;
          }
          if (radix !== 16) {
              return false;
          }
          var lowered = code | CharCodes.To_LOWER_BIT;
          return lowered >= CharCodes.LOWER_A && lowered <= CharCodes.LOWER_F;
      }
      return function decodeHTMLBinary(str, strict) {
          var output = "";
          // Everything before this index has already been written to `output`.
          var copiedUpTo = 0;
          for (var cursor = str.indexOf("&", 0); cursor >= 0; cursor = str.indexOf("&", cursor)) {
              output += str.slice(copiedUpTo, cursor);
              copiedUpTo = cursor;
              // Step over the "&".
              cursor += 1;
              // Numeric references take a separate path from named ones.
              if (str.charCodeAt(cursor) === CharCodes.NUM) {
                  var radix = 10;
                  // First digit sits after "#", or after "#x" for hex.
                  var digitsStart = cursor + 1;
                  if ((str.charCodeAt(digitsStart) | CharCodes.To_LOWER_BIT) === CharCodes.LOWER_X) {
                      radix = 16;
                      cursor += 1;
                      digitsStart += 1;
                  }
                  // Advance to the first character that is not a digit.
                  cursor += 1;
                  while (isDigit(str.charCodeAt(cursor), radix)) {
                      cursor += 1;
                  }
                  // No digits at all: leave the text as it is.
                  if (digitsStart === cursor) {
                      continue;
                  }
                  var codePoint = parseInt(str.substring(digitsStart, cursor), radix);
                  if (str.charCodeAt(cursor) === CharCodes.SEMI) {
                      cursor += 1;
                  }
                  else if (strict) {
                      // Strict mode refuses a reference without its ";".
                      continue;
                  }
                  output += (0, decode_codepoint_js_1.default)(codePoint);
                  copiedUpTo = cursor;
                  continue;
              }
              // Named reference: walk the trie one character at a time.
              var matchIdx = 0;
              // Distance from the last accepted value to the cursor, used to
              // rewind to the end of the longest accepted match.
              var unconsumed = 1;
              var nodeIdx = 0;
              var node = decodeTree[nodeIdx];
              while (cursor < str.length) {
                  nodeIdx = determineBranch(decodeTree, node, nodeIdx + 1, str.charCodeAt(cursor));
                  if (nodeIdx < 0) {
                      break;
                  }
                  node = decodeTree[nodeIdx];
                  var lengthBits = node & BinTrieFlags.VALUE_LENGTH;
                  if (lengthBits) {
                      // In strict mode a value only counts when reached via ";".
                      if (!strict || str.charCodeAt(cursor) === CharCodes.SEMI) {
                          matchIdx = nodeIdx;
                          unconsumed = 0;
                      }
                      // Words that follow the node header and hold the value.
                      var trailingWords = (lengthBits >> 14) - 1;
                      if (trailingWords === 0) {
                          break;
                      }
                      nodeIdx += trailingWords;
                  }
                  cursor += 1;
                  unconsumed += 1;
              }
              if (matchIdx === 0) {
                  continue;
              }
              var header = decodeTree[matchIdx];
              switch ((header & BinTrieFlags.VALUE_LENGTH) >> 14) {
                  case 1:
                      // Value is packed into the header word itself.
                      output += String.fromCharCode(header & ~BinTrieFlags.VALUE_LENGTH);
                      break;
                  case 2:
                      output += String.fromCharCode(decodeTree[matchIdx + 1]);
                      break;
                  default:
                      output += String.fromCharCode(decodeTree[matchIdx + 1], decodeTree[matchIdx + 2]);
              }
              copiedUpTo = cursor - unconsumed + 1;
          }
          return output + str.slice(copiedUpTo);
      };
  }
  /**
   * Finds the trie index to move to for `char`, given the current node.
   *
   * @param decodeTree Trie data, indexed as an array of numbers.
   * @param current Header word of the node being left.
   * @param nodeIdx Index at which that node's branch data begins.
   * @param char Character code to follow.
   * @returns Index of the next node, or -1 when no branch matches.
   */
  function determineBranch(decodeTree, current, nodeIdx, char) {
      var jumpBase = current & BinTrieFlags.JUMP_TABLE;
      var numBranches = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
      // Form 1: a single branch whose character is stored in the jump bits.
      if (numBranches === 0) {
          if (jumpBase !== 0 && char === jumpBase) {
              return nodeIdx;
          }
          return -1;
      }
      // Form 2: a jump table indexed by the distance from the base character.
      if (jumpBase) {
          var slot = char - jumpBase;
          if (slot < 0 || slot >= numBranches) {
              return -1;
          }
          return decodeTree[nodeIdx + slot] - 1;
      }
      // Form 3: a dictionary of keys followed by `numBranches` targets;
      // binary-search the keys.
      var low = nodeIdx;
      var high = nodeIdx + numBranches - 1;
      while (low <= high) {
          var middle = (low + high) >>> 1;
          var key = decodeTree[middle];
          if (key > char) {
              high = middle - 1;
              continue;
          }
          if (key < char) {
              low = middle + 1;
              continue;
          }
          return decodeTree[middle + numBranches];
      }
      return -1;
  }
  146. exports.determineBranch = determineBranch;
  // Decoders bound to the HTML and XML entity tries required at the top of
  // this module. Each is built once here by getDecoder and called as
  // `(str, strict)` by the exported decode functions.
  var htmlDecoder = getDecoder(decode_data_html_js_1.default);
  var xmlDecoder = getDecoder(decode_data_xml_js_1.default);
  /**
   * Decodes the entities in an HTML string in lenient mode: an entity is
   * accepted even when its closing semi-colon is missing.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeHTML(str) {
      var requireSemicolon = false;
      return htmlDecoder(str, requireSemicolon);
  }
  158. exports.decodeHTML = decodeHTML;
  /**
   * Decodes the entities in an HTML string in strict mode: an entity is only
   * accepted when it is closed by a semi-colon.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeHTMLStrict(str) {
      var requireSemicolon = true;
      return htmlDecoder(str, requireSemicolon);
  }
  168. exports.decodeHTMLStrict = decodeHTMLStrict;
  /**
   * Decodes the entities in an XML string in strict mode: an entity is only
   * accepted when it is closed by a semi-colon.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeXML(str) {
      var requireSemicolon = true;
      return xmlDecoder(str, requireSemicolon);
  }
  178. exports.decodeXML = decodeXML;
  179. //# sourceMappingURL=decode.js.map