// tokenizer.js
  1. import { CharacterStream, LiteralToken, ParseException, Token, TokenStream, TokenType } from './index.js'
  2. const regexpToken = (stream, tokens) => {
  3. if (tokens.length > 0) {
  4. let token = tokens[tokens.length - 1];
  5. if (token instanceof LiteralToken) {
  6. return false;
  7. }
  8. switch (token.getTokenType()) {
  9. case TokenType.Comma: // ,
  10. case TokenType.Semicolon: // ;
  11. case TokenType.Colon: // :
  12. case TokenType.RightCurly: // }
  13. case TokenType.LeftBracket: // [
  14. case TokenType.LeftParantheses: // (
  15. case TokenType.Assignment: // =
  16. case TokenType.NotEqual: // !=
  17. case TokenType.EqualEqualEqual: // ===
  18. case TokenType.NotEqualEqual: // !==
  19. case TokenType.Equal: // ==
  20. case TokenType.And: // &&
  21. case TokenType.Or: // ||
  22. case TokenType.SqlAnd: // and
  23. case TokenType.SqlOr: // or
  24. case TokenType.SqlNotEqual: // <>
  25. case TokenType.Questionmark: // ?
  26. case TokenType.InstanceOf: // instanceof
  27. case TokenType.Lambda: // => ->
  28. case TokenType.Not: // !
  29. break;
  30. default: return false;
  31. }
  32. }
  33. if (stream.match("/", false)) {
  34. let mark = stream.getPosition();
  35. stream.consume();
  36. stream.startSpan();
  37. let matchedEndQuote = false;
  38. let deep = 0;
  39. let maybeMissForwardSlash = 0;
  40. let maybeMissForwardSlashEnd = 0;
  41. while (stream.hasMore()) {
  42. // Note: escape sequences like \n are parsed in StringLiteral
  43. if (stream.match("\\", true)) {
  44. stream.consume();
  45. continue;
  46. }
  47. if (stream.match("[", false)) {
  48. deep++;
  49. maybeMissForwardSlash = stream.getPosition();
  50. } else if (deep > 0 && stream.match("]", false)) {
  51. deep--;
  52. } else if (stream.match(TokenType.ForwardSlash.literal, true)) {
  53. if (deep === 0) {
  54. if (stream.match("g", true)) {
  55. }
  56. if (stream.match("i", true)) {
  57. }
  58. if (stream.match("m", true)) {
  59. }
  60. if (stream.match("s", true)) {
  61. }
  62. if (stream.match("u", true)) {
  63. }
  64. if (stream.match("y", true)) {
  65. }
  66. matchedEndQuote = true;
  67. break;
  68. } else {
  69. maybeMissForwardSlashEnd = stream.getPosition();
  70. }
  71. }
  72. let ch = stream.consume();
  73. if (ch === '\r' || ch === '\n') {
  74. stream.reset(mark);
  75. return false;
  76. }
  77. }
  78. if (deep !== 0) {
  79. throw new ParseException("Missing ']'", stream.getSpan(maybeMissForwardSlash, maybeMissForwardSlashEnd - 1));
  80. }
  81. if (!matchedEndQuote) {
  82. stream.reset(mark);
  83. return false;
  84. }
  85. let regexpSpan = stream.endSpan();
  86. regexpSpan = stream.getSpan(regexpSpan.getStart() - 1, regexpSpan.getEnd());
  87. tokens.push(new LiteralToken(TokenType.RegexpLiteral, regexpSpan));
  88. return true;
  89. }
  90. return false;
  91. }
  92. const tokenizerString = (stream, tokenType, tokens) => {
  93. // String literal
  94. if (stream.match(tokenType, true)) {
  95. stream.startSpan();
  96. let matchedEndQuote = false;
  97. while (stream.hasMore()) {
  98. // Note: escape sequences like \n are parsed in StringLiteral
  99. if (stream.match("\\", true)) {
  100. stream.consume();
  101. continue;
  102. }
  103. if (stream.match(tokenType.literal, true)) {
  104. matchedEndQuote = true;
  105. break;
  106. }
  107. let ch = stream.consume();
  108. if (tokenType !== TokenType.TripleQuote && (ch === '\r' || ch === '\n')) {
  109. throw new ParseException(tokenType.error + tokenType.error + "定义的字符串不能换行", stream.endSpan());
  110. }
  111. }
  112. if (!matchedEndQuote) {
  113. throw new ParseException("字符串没有结束符" + tokenType.error, stream.endSpan());
  114. }
  115. let stringSpan = stream.endSpan();
  116. stringSpan = stream.getSpan(stringSpan.getStart(), stringSpan.getEnd() - tokenType.literal.length);
  117. tokens.push(new LiteralToken(TokenType.StringLiteral, stringSpan));
  118. return true;
  119. }
  120. return false;
  121. };
  122. const autoNumberType = (span, radix) => {
  123. let value = Number.parseInt(span.getText().substring(2).replace(/\_/g, ''), radix)
  124. if (value > 0x7fffffff || value < -0x80000000) {
  125. return new LiteralToken(TokenType.LongLiteral, span, value);
  126. } else if (value > 127 || value < -128) {
  127. return new LiteralToken(TokenType.IntegerLiteral, span, value);
  128. }
  129. return new LiteralToken(TokenType.ByteLiteral, span, value);
  130. }
  131. const tokenizerNumber = (stream, tokens) => {
  132. if (stream.match('0', false)) {
  133. let index = stream.getPosition();
  134. stream.startSpan();
  135. stream.consume();
  136. if (stream.matchAny(['x', 'X'], true)) {
  137. while (stream.matchDigit(true) || stream.matchAny(["A", "B", "C", "D", "E", "F", "a", "b", "c", "d", "e", "f", "_"], true)) {
  138. ;
  139. }
  140. if (stream.matchAny(["L", "l"], true)) {
  141. let span = stream.endSpan();
  142. let text = span.getText();
  143. tokens.push(new LiteralToken(TokenType.LongLiteral, span, parseInt(text.substring(2, text.length - 1).replace(/\_/g, ''), 16)));
  144. return true;
  145. }
  146. tokens.push(autoNumberType(stream.endSpan(), 16));
  147. return true;
  148. } else if (stream.matchAny(['b', 'B'], true)) {
  149. while (stream.matchAny(['0', '1', '_'], true)) {
  150. ;
  151. }
  152. if (stream.matchAny(["L", "l"], true)) {
  153. let span = stream.endSpan();
  154. let text = span.getText();
  155. tokens.push(new LiteralToken(TokenType.LongLiteral, span, parseInt(text.substring(2, text.length - 1).replace(/\_/g, ''), 2)));
  156. return true;
  157. }
  158. tokens.push(autoNumberType(stream.endSpan(), 2));
  159. return true;
  160. }
  161. stream.reset(index);
  162. }
  163. if (stream.matchDigit(false)) {
  164. let type = TokenType.IntegerLiteral;
  165. stream.startSpan();
  166. while (stream.matchDigit(true) || stream.match('_', true)) {
  167. }
  168. if (stream.match(TokenType.Period.literal, true)) {
  169. if (stream.hasMore()) {
  170. type = TokenType.DoubleLiteral;
  171. while (stream.matchDigit(true) || stream.match('_', true)) {
  172. }
  173. } else {
  174. stream.reset(stream.getPosition() - 1)
  175. }
  176. }
  177. if (stream.matchAny(['b', 'B'], true)) {
  178. if (type === TokenType.DoubleLiteral) {
  179. throw new ParseException('Byte literal can not have a decimal point.', stream.endSpan());
  180. }
  181. type = TokenType.ByteLiteral;
  182. } else if (stream.matchAny(['s', 'S'], true)) {
  183. if (type === TokenType.DoubleLiteral) {
  184. throw new ParseException('Short literal can not have a decimal point.', stream.endSpan());
  185. }
  186. type = TokenType.ShortLiteral;
  187. } else if (stream.matchAny(['l', 'L'], true)) {
  188. if (type === TokenType.DoubleLiteral) {
  189. throw new ParseException('Long literal can not have a decimal point.', stream.endSpan());
  190. }
  191. type = TokenType.LongLiteral;
  192. } else if (stream.matchAny(['f', 'F'], true)) {
  193. type = TokenType.FloatLiteral;
  194. } else if (stream.matchAny(['d', 'D'], true)) {
  195. type = TokenType.DoubleLiteral;
  196. } else if (stream.matchAny(['m', 'M'], true)) {
  197. type = TokenType.DecimalLiteral;
  198. }
  199. tokens.push(new LiteralToken(type, stream.endSpan()));
  200. return true
  201. }
  202. return false;
  203. }
  204. const tokenizerLanguage = (stream, tokens) => {
  205. if (stream.match("```", true)) {
  206. stream.startSpan();
  207. if (stream.matchIdentifierStart(true)) {
  208. while (stream.matchIdentifierPart(true)) {
  209. }
  210. let language = stream.endSpan();
  211. tokens.push(new Token(TokenType.Language, language));
  212. stream.startSpan();
  213. if (!stream.skipUntil("```")) {
  214. throw new ParseException('```需要以```结尾', stream.endSpan());
  215. }
  216. tokens.push(new Token(TokenType.Language, stream.endSpan(-3)));
  217. return true;
  218. } else {
  219. throw new ParseException('```后需要标识语言类型', stream.endSpan());
  220. }
  221. }
  222. return false;
  223. }
  224. const tokenizerIdentifier = (stream, tokens) => {
  225. if (stream.matchIdentifierStart(true)) {
  226. stream.startSpan();
  227. while (stream.matchIdentifierPart(true)) {
  228. }
  229. let identifierSpan = stream.endSpan();
  230. identifierSpan = stream.getSpan(identifierSpan.getStart() - 1, identifierSpan.getEnd());
  231. if ("true" === identifierSpan.getText() || "false" === identifierSpan.getText()) {
  232. tokens.push(new LiteralToken(TokenType.BooleanLiteral, identifierSpan));
  233. } else if ("null" === identifierSpan.getText()) {
  234. tokens.push(new LiteralToken(TokenType.NullLiteral, identifierSpan));
  235. } else if (TokenType.SqlAnd.literal.toUpperCase() === identifierSpan.getText().toUpperCase()) {
  236. tokens.push(new Token(TokenType.SqlAnd, identifierSpan));
  237. } else if (TokenType.SqlOr.literal.toUpperCase() === identifierSpan.getText().toUpperCase()) {
  238. tokens.push(new Token(TokenType.SqlOr, identifierSpan));
  239. } else {
  240. tokens.push(new Token(TokenType.Identifier, identifierSpan));
  241. }
  242. return true;
  243. }
  244. return false;
  245. }
// Reads a template string literal `...${expr}...` and pushes a single
// StringLiteral token whose value is a TokenStream of sub-tokens: the plain
// text segments (as StringLiteral tokens) interleaved with the tokens of each
// ${...} interpolation (tokenized recursively with "}" as the stop token).
const tokenizerTemplateString = (stream, tokens) => {
    if (stream.match("`", true)) {
        // Position just after the opening backtick; `start` tracks the beginning
        // of the current plain-text segment.
        let begin = stream.getPosition();
        let start = begin;
        // NOTE(review): matchedEndQuote is set below but never checked after the
        // loop — an unterminated template string falls through silently instead
        // of raising a ParseException like tokenizerString does; confirm intent.
        let matchedEndQuote = false;
        let subTokens = [];
        while (stream.hasMore()) {
            // Skip escaped characters (escape sequences are resolved later).
            if (stream.match("\\", true)) {
                stream.consume();
                continue;
            }
            if (stream.match("`", true)) {
                matchedEndQuote = true;
                break;
            }
            if (stream.match("${", true)) {
                let end = stream.getPosition();
                // Flush the text segment preceding "${" (end - 2 excludes "${").
                if (start < end - 2) {
                    subTokens.push(new LiteralToken(TokenType.StringLiteral, stream.endSpan(start, end - 2)));
                }
                // Recursively tokenize the interpolation until the matching "}".
                subTokens.push(...tokenizer(stream, [], "}"));
                start = stream.getPosition();
                continue;
            }
            stream.consume();
        }
        let stringSpan = stream.endSpan(begin, stream.getPosition());
        // end - 1 excludes the closing backtick from the final text segment.
        let end = stream.getPosition() - 1;
        if (end - start > 0) {
            subTokens.push(new LiteralToken(TokenType.StringLiteral, stream.endSpan(start, end)));
        }
        // Widen the outer span by one so it includes the opening backtick.
        stringSpan = stream.getSpan(stringSpan.getStart() - 1, stringSpan.getEnd());
        tokens.push(new LiteralToken(TokenType.StringLiteral, stringSpan, new TokenStream(subTokens)));
        return true;
    }
    return false;
}
  283. const tokenizer = (stream, tokens, except) => {
  284. let leftCount = 0;
  285. let rightCount = 0;
  286. while (stream.hasMore()) {
  287. stream.skipWhiteSpace();
  288. if (except && stream.match(except, true)) {
  289. return tokens;
  290. }
  291. if (stream.match("//", true)) { //注释
  292. stream.skipLine();
  293. continue;
  294. }
  295. if (stream.match("/*", true)) { //多行注释
  296. stream.skipUntil("*/");
  297. continue;
  298. }
  299. // int short double long float byte decimal
  300. if (tokenizerNumber(stream, tokens)) {
  301. continue;
  302. }
  303. // '' "" """ """
  304. if (tokenizerString(stream, TokenType.SingleQuote, tokens) || tokenizerString(stream, TokenType.TripleQuote, tokens) || tokenizerString(stream, TokenType.DoubleQuote, tokens)) {
  305. continue;
  306. }
  307. // regexp
  308. if (regexpToken(stream, tokens)) {
  309. continue;
  310. }
  311. // ``` ```
  312. if (tokenizerLanguage(stream, tokens)) {
  313. continue;
  314. }
  315. // template string
  316. if (tokenizerTemplateString(stream, tokens)) {
  317. continue;
  318. }
  319. // Identifier, keyword, boolean literal, or null literal
  320. if (tokenizerIdentifier(stream, tokens)) {
  321. continue;
  322. }
  323. // lambda
  324. if (stream.matchAny(['=>', '->'], true)) {
  325. tokens.push(new Token(TokenType.Lambda, stream.getSpan(stream.getPosition() - 2, stream.getPosition())));
  326. continue;
  327. }
  328. let outer = false;
  329. // Simple tokens
  330. let sortedTokens = TokenType.getSortedValues();
  331. for (let i = 0, len = sortedTokens.length; i < len; i++) {
  332. let t = sortedTokens[i];
  333. if (t.literal != null) {
  334. if (stream.match(t.literal, true)) {
  335. if (t === TokenType.LeftCurly) {
  336. leftCount++;
  337. }
  338. tokens.push(new Token(t, stream.getSpan(stream.getPosition() - t.literal.length, stream.getPosition())));
  339. outer = true;
  340. break;
  341. }
  342. }
  343. }
  344. if (outer) {
  345. continue;
  346. }
  347. if (leftCount !== rightCount && stream.match("}", true)) {
  348. rightCount++;
  349. tokens.push(new Token(TokenType.RightCurly, stream.getSpan(stream.getPosition() - 1, stream.getPosition())));
  350. continue;
  351. }
  352. if (stream.hasMore()) {
  353. throw new ParseException("Unknown token", stream.getSpan(stream.getPosition(), stream.getPosition() + 1));
  354. }
  355. }
  356. return tokens;
  357. }
  358. export default (source) => {
  359. return tokenizer(new CharacterStream(source, 0, source.length), [])
  360. }