Tokenizer.cs 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. // JsonKit v0.5 - A simple but flexible Json library in a single .cs file.
  2. //
  3. // Copyright (C) 2014 Topten Software (contact@toptensoftware.com) All rights reserved.
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this product
  6. // except in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed under the
  11. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  12. // either express or implied. See the License for the specific language governing permissions
  13. // and limitations under the License.
  14. using System;
  15. using System.Collections.Generic;
  16. using System.Text;
  17. using System.IO;
  18. using System.Globalization;
  19. namespace Topten.JsonKit
  20. {
  // Splits text read from a TextReader into tokens (punctuation, literals,
  // identifiers and EOF).
  //
  // The constructor reads the first token, so CurrentToken is valid as soon as
  // the object exists. Call NextToken() to advance. Bookmarks (CreateBookmark /
  // RewindToBookmark / DiscardBookmark) allow the caller to look ahead and then
  // return to an earlier position in the input.
  class Tokenizer
  {
      // Creates a tokenizer over `r` and reads the first token.
      //   r       - the text source; must not be null
      //   options - parser flags; only JsonOptions.StrictParser is consulted here
      // Throws ArgumentNullException if `r` is null, and InvalidDataException if
      // the first token is malformed.
      public Tokenizer(TextReader r, JsonOptions options)
      {
          if (r == null)
              throw new ArgumentNullException("r");

          _underlying = r;
          _options = options;
          FillBuffer();
          NextChar();
          NextToken();
      }

      private readonly JsonOptions _options;

      // Scratch builder shared by the string, identifier and number scanners
      private readonly StringBuilder _sb = new StringBuilder();

      // Underlying input and the block of characters most recently read from it
      private readonly TextReader _underlying;
      private readonly char[] _buf = new char[4096];
      private int _pos;
      private int _bufUsed;

      // While non-null, holds the characters consumed since the oldest bookmark
      // so that they can be replayed after a rewind. _rewindBufferPos is the
      // index of the next character to replay.
      private StringBuilder _rewindBuffer;
      private int _rewindBufferPos;

      private LineOffset _currentCharPos;
      private char _currentChar;
      private readonly Stack<ReaderState> _bookmarks = new Stack<ReaderState>();

      // Position recorded at the start of the current token
      public LineOffset CurrentTokenPosition;

      // Kind of the current token
      public Token CurrentToken;

      // For Token.Literal, the kind of literal
      public LiteralKind LiteralKind;

      // Text of the current string literal, identifier, or unparsed number
      public string String;

      // The current literal converted to a CLR value: null, bool, string, long
      // (numbers with a leading '-'), ulong (other integers, including 0x hex)
      // or double.
      // Throws InvalidOperationException if the current token is not a literal.
      // Numeric text is parsed here, so malformed numbers surface as the
      // exceptions thrown by the underlying Parse/Convert calls.
      public object LiteralValue
      {
          get
          {
              if (CurrentToken != Token.Literal)
                  throw new InvalidOperationException("token is not a literal");

              switch (LiteralKind)
              {
                  case LiteralKind.Null: return null;
                  case LiteralKind.False: return false;
                  case LiteralKind.True: return true;
                  case LiteralKind.String: return String;
                  case LiteralKind.SignedInteger: return long.Parse(String, CultureInfo.InvariantCulture);
                  case LiteralKind.UnsignedInteger:
                      // Ordinal comparison: the prefix is syntax, not linguistic text
                      if (String.StartsWith("0x", StringComparison.Ordinal) || String.StartsWith("0X", StringComparison.Ordinal))
                          return Convert.ToUInt64(String.Substring(2), 16);
                      else
                          return ulong.Parse(String, CultureInfo.InvariantCulture);
                  case LiteralKind.FloatingPoint: return double.Parse(String, CultureInfo.InvariantCulture);
              }
              return null;
          }
      }

      // The CLR type that LiteralValue produces for the current literal
      // (typeof(Object) for null literals).
      // Throws InvalidOperationException if the current token is not a literal.
      public Type LiteralType
      {
          get
          {
              if (CurrentToken != Token.Literal)
                  throw new InvalidOperationException("token is not a literal");

              switch (LiteralKind)
              {
                  case LiteralKind.Null: return typeof(Object);
                  case LiteralKind.False: return typeof(Boolean);
                  case LiteralKind.True: return typeof(Boolean);
                  case LiteralKind.String: return typeof(string);
                  case LiteralKind.SignedInteger: return typeof(long);
                  case LiteralKind.UnsignedInteger: return typeof(ulong);
                  case LiteralKind.FloatingPoint: return typeof(double);
              }
              return null;
          }
      }

      // This object represents the entire state of the reader and is used for rewind
      struct ReaderState
      {
          // Captures the tokenizer's current character, token and replay position
          public ReaderState(Tokenizer tokenizer)
          {
              _currentCharPos = tokenizer._currentCharPos;
              _currentChar = tokenizer._currentChar;
              _string = tokenizer.String;
              _literalKind = tokenizer.LiteralKind;
              _rewindBufferPos = tokenizer._rewindBufferPos;
              _currentTokenPos = tokenizer.CurrentTokenPosition;
              _currentToken = tokenizer.CurrentToken;
          }

          // Writes the captured state back into the tokenizer
          public void Apply(Tokenizer tokenizer)
          {
              tokenizer._currentCharPos = _currentCharPos;
              tokenizer._currentChar = _currentChar;
              tokenizer._rewindBufferPos = _rewindBufferPos;
              tokenizer.CurrentToken = _currentToken;
              tokenizer.CurrentTokenPosition = _currentTokenPos;
              tokenizer.String = _string;
              tokenizer.LiteralKind = _literalKind;
          }

          private LineOffset _currentCharPos;
          private LineOffset _currentTokenPos;
          private char _currentChar;
          private Token _currentToken;
          private LiteralKind _literalKind;
          private string _string;
          private int _rewindBufferPos;
      }

      // Create a rewind bookmark at the current token. Bookmarks nest; each one
      // must later be released by RewindToBookmark or DiscardBookmark.
      public void CreateBookmark()
      {
          _bookmarks.Push(new ReaderState(this));

          // Start recording consumed characters (a buffer may already exist if
          // another bookmark is active or a rewind is still being replayed)
          if (_rewindBuffer == null)
          {
              _rewindBuffer = new StringBuilder();
              _rewindBufferPos = 0;
          }
      }

      // Discard the most recent bookmark, staying at the current position.
      // Throws InvalidOperationException (from Stack.Pop) if there is none.
      public void DiscardBookmark()
      {
          _bookmarks.Pop();

          // The recorded characters can only be dropped once nothing can rewind
          // to them AND none of them are still waiting to be replayed (which is
          // the case when an earlier rewind has not been fully re-read yet).
          // Dropping them early would silently lose input.
          if (_bookmarks.Count == 0 && _rewindBuffer != null && _rewindBufferPos >= _rewindBuffer.Length)
          {
              _rewindBuffer = null;
              _rewindBufferPos = 0;
          }
      }

      // Rewind to the most recent bookmark and release it.
      // Throws InvalidOperationException (from Stack.Pop) if there is none.
      public void RewindToBookmark()
      {
          _bookmarks.Pop().Apply(this);
      }

      // Fill buffer by reading from underlying TextReader
      void FillBuffer()
      {
          _bufUsed = _underlying.Read(_buf, 0, _buf.Length);
          _pos = 0;
      }

      // Get the next character from the input stream, make it the current
      // character and return it. Returns '\0' once the input is exhausted.
      // (this function could be extracted into a few different methods, but is mostly inlined
      // for performance - yes it makes a difference)
      public char NextChar()
      {
          if (_rewindBuffer != null)
          {
              // Replaying characters recorded before a rewind
              if (_rewindBufferPos < _rewindBuffer.Length)
              {
                  _currentCharPos.Offset++;
                  return _currentChar = _rewindBuffer[_rewindBufferPos++];
              }

              // Caught up with the live input while a bookmark is outstanding:
              // read a fresh character and record it for a possible rewind
              if (_bookmarks.Count > 0)
              {
                  if (_pos >= _bufUsed && _bufUsed > 0)
                      FillBuffer();
                  _currentChar = _bufUsed == 0 ? '\0' : _buf[_pos++];
                  _rewindBuffer.Append(_currentChar);
                  _rewindBufferPos++;
                  _currentCharPos.Offset++;
                  return _currentChar;
              }

              // Replay finished and nothing can rewind any more: stop recording
              // so the buffer does not keep growing, and use the fast path below
              _rewindBuffer = null;
              _rewindBufferPos = 0;
          }

          // Fast path: read straight from the block buffer
          if (_pos >= _bufUsed)
          {
              if (_bufUsed > 0)
              {
                  FillBuffer();
              }
              if (_bufUsed == 0)
              {
                  return _currentChar = '\0';
              }
          }

          // Next
          _currentCharPos.Offset++;
          return _currentChar = _buf[_pos++];
      }

      // Read the next token from the input stream into CurrentToken (and, where
      // applicable, LiteralKind and String).
      // Throws InvalidDataException on a syntax error.
      // (Whitespace skipping and punctuation are inline for performance)
      public void NextToken()
      {
          while (true)
          {
              // Skip whitespace and handle line numbers
              while (true)
              {
                  if (_currentChar == '\r')
                  {
                      // "\r\n" counts as a single line break
                      if (NextChar() == '\n')
                      {
                          NextChar();
                      }
                      _currentCharPos.Line++;
                      _currentCharPos.Offset = 0;
                  }
                  else if (_currentChar == '\n')
                  {
                      // "\n\r" counts as a single line break
                      if (NextChar() == '\r')
                      {
                          NextChar();
                      }
                      _currentCharPos.Line++;
                      _currentCharPos.Offset = 0;
                  }
                  else if (_currentChar == ' ' || _currentChar == '\t')
                  {
                      NextChar();
                  }
                  else
                      break;
              }

              // Remember position of token
              CurrentTokenPosition = _currentCharPos;

              // Handle common characters first
              switch (_currentChar)
              {
                  case '/':
                      // A comment is not a token; go round again for what follows it
                      SkipComment();
                      continue;

                  case '\"':
                  case '\'':
                      TokenizeString();
                      return;

                  case '{': CurrentToken = Token.OpenBrace; NextChar(); return;
                  case '}': CurrentToken = Token.CloseBrace; NextChar(); return;
                  case '[': CurrentToken = Token.OpenSquare; NextChar(); return;
                  case ']': CurrentToken = Token.CloseSquare; NextChar(); return;
                  case '=': CurrentToken = Token.Equal; NextChar(); return;
                  case ':': CurrentToken = Token.Colon; NextChar(); return;
                  case ';': CurrentToken = Token.SemiColon; NextChar(); return;
                  case ',': CurrentToken = Token.Comma; NextChar(); return;
                  case '\0': CurrentToken = Token.EOF; return;
              }

              // Number? Only ASCII digits qualify: char.IsDigit also accepts other
              // Unicode decimal digits, which TokenizeNumber would not consume,
              // yielding an empty number token without advancing the input.
              if ((_currentChar >= '0' && _currentChar <= '9') || _currentChar == '-')
              {
                  TokenizeNumber();
                  return;
              }

              // Identifier? (checked for after everything else as identifiers are actually quite rare in valid json)
              if (Char.IsLetter(_currentChar) || _currentChar == '_' || _currentChar == '$')
              {
                  TokenizeIdentifier();
                  return;
              }

              // What the?
              throw new InvalidDataException(string.Format("syntax error, unexpected character '{0}'", _currentChar));
          }
      }

      // Skip a "//" or "/* */" comment. On entry the current character is the
      // leading '/'. A line comment stops at (without consuming) the line break
      // so the caller's whitespace loop still counts the line. A block comment
      // that is never closed runs to the end of input without raising an error.
      // Throws InvalidDataException in strict mode, or if the '/' does not
      // start a comment.
      private void SkipComment()
      {
          // Comments are not supported in strict mode
          if ((_options & JsonOptions.StrictParser) != 0)
          {
              throw new InvalidDataException(string.Format("syntax error, unexpected character '{0}'", _currentChar));
          }

          // Process comment
          NextChar();
          switch (_currentChar)
          {
              case '/':
                  NextChar();
                  while (_currentChar != '\0' && _currentChar != '\r' && _currentChar != '\n')
                  {
                      NextChar();
                  }
                  break;

              case '*':
                  // Step past the opening '*' so that it cannot double as the
                  // start of the closing "*/" (i.e. "/*/" does not close)
                  NextChar();

                  bool endFound = false;
                  while (!endFound && _currentChar != '\0')
                  {
                      if (_currentChar != '*')
                      {
                          NextChar();
                      }
                      else if (NextChar() == '/')
                      {
                          // Consume the closing '/'
                          NextChar();
                          endFound = true;
                      }
                      // else: the character after the '*' is examined again on
                      // the next pass, so a run such as "**/" still closes
                  }
                  break;

              default:
                  throw new InvalidDataException("syntax error, unexpected character after slash");
          }
      }

      // Read a string literal. On entry the current character is the opening
      // quote (single or double); the literal ends at the matching quote.
      // On success sets String to the unescaped text and leaves the current
      // character just past the closing quote.
      // Throws InvalidDataException for an invalid escape or a missing closing quote.
      private void TokenizeString()
      {
          _sb.Length = 0;
          var quoteKind = _currentChar;
          NextChar();
          while (_currentChar != '\0')
          {
              if (_currentChar == '\\')
              {
                  NextChar();
                  switch (_currentChar)
                  {
                      case '\"': _sb.Append('\"'); break;
                      case '\\': _sb.Append('\\'); break;
                      case '/': _sb.Append('/'); break;
                      case 'b': _sb.Append('\b'); break;
                      case 'f': _sb.Append('\f'); break;
                      case 'n': _sb.Append('\n'); break;
                      case 'r': _sb.Append('\r'); break;
                      case 't': _sb.Append('\t'); break;
                      case 'u': _sb.Append(ReadUnicodeEscape()); break;

                      case '\'':
                          // Needed to put a quote inside a single-quoted string;
                          // strict mode keeps rejecting it as before
                          if ((_options & JsonOptions.StrictParser) != 0)
                              goto default;
                          _sb.Append('\'');
                          break;

                      default:
                          throw new InvalidDataException(string.Format("Invalid escape sequence in string literal: '\\{0}'", _currentChar));
                  }
              }
              else if (_currentChar == quoteKind)
              {
                  String = _sb.ToString();
                  CurrentToken = Token.Literal;
                  LiteralKind = LiteralKind.String;
                  NextChar();
                  return;
              }
              else
              {
                  _sb.Append(_currentChar);
              }

              // Step past the character (or the last character of the escape)
              NextChar();
          }
          throw new InvalidDataException("syntax error, unterminated string literal");
      }

      // Read the four hex digits of a \uXXXX escape and return the UTF-16 code
      // unit they encode. On entry the current character is the 'u'; on exit it
      // is the fourth hex digit.
      // Throws InvalidDataException if any of the four characters is not a hex
      // digit (including when the input ends early).
      private char ReadUnicodeEscape()
      {
          int codeUnit = 0;
          for (int i = 0; i < 4; i++)
          {
              char c = NextChar();
              int digit;
              if (c >= '0' && c <= '9')
                  digit = c - '0';
              else if (c >= 'a' && c <= 'f')
                  digit = c - 'a' + 10;
              else if (c >= 'A' && c <= 'F')
                  digit = c - 'A' + 10;
              else
                  throw new InvalidDataException("syntax error, invalid unicode escape sequence in string literal");

              codeUnit = (codeUnit << 4) | digit;
          }
          return (char)codeUnit;
      }

      // Read an identifier (letters, digits, '_' and '$'). On entry the current
      // character is its first character. The words true, false and null become
      // literals; anything else becomes Token.Identifier with its text in String.
      private void TokenizeIdentifier()
      {
          // Find end of identifier
          _sb.Length = 0;
          while (Char.IsLetterOrDigit(_currentChar) || _currentChar == '_' || _currentChar == '$')
          {
              _sb.Append(_currentChar);
              NextChar();
          }
          String = _sb.ToString();

          // Handle special identifiers
          switch (String)
          {
              case "true":
                  LiteralKind = LiteralKind.True;
                  CurrentToken = Token.Literal;
                  return;

              case "false":
                  LiteralKind = LiteralKind.False;
                  CurrentToken = Token.Literal;
                  return;

              case "null":
                  LiteralKind = LiteralKind.Null;
                  CurrentToken = Token.Literal;
                  return;
          }

          CurrentToken = Token.Identifier;
      }

      // Parse a sequence of characters that could make up a valid number
      // For performance, we don't actually parse it into a number yet. When using Topten.JsonKitEmit we parse
      // later, directly into a value type to avoid boxing
      //
      // Sets String to the number's text and LiteralKind to FloatingPoint (a '.'
      // or exponent was seen), SignedInteger (leading '-') or UnsignedInteger.
      // Outside strict mode a "0x"/"0X" prefix introduces a hex number.
      // Throws InvalidDataException if a letter immediately follows the number.
      private void TokenizeNumber()
      {
          _sb.Length = 0;

          // Leading negative sign
          bool signed = false;
          if (_currentChar == '-')
          {
              signed = true;
              _sb.Append(_currentChar);
              NextChar();
          }

          // Hex prefix?
          bool hex = false;
          if (_currentChar == '0' && (_options & JsonOptions.StrictParser) == 0)
          {
              _sb.Append(_currentChar);
              NextChar();
              if (_currentChar == 'x' || _currentChar == 'X')
              {
                  _sb.Append(_currentChar);
                  NextChar();
                  hex = true;
              }
          }

          // Process characters, but vaguely figure out what type it is
          bool cont = true;
          bool fp = false;
          while (cont)
          {
              switch (_currentChar)
              {
                  case '0':
                  case '1':
                  case '2':
                  case '3':
                  case '4':
                  case '5':
                  case '6':
                  case '7':
                  case '8':
                  case '9':
                      _sb.Append(_currentChar);
                      NextChar();
                      break;

                  case 'A':
                  case 'a':
                  case 'B':
                  case 'b':
                  case 'C':
                  case 'c':
                  case 'D':
                  case 'd':
                  case 'F':
                  case 'f':
                      if (!hex)
                          cont = false;
                      else
                      {
                          _sb.Append(_currentChar);
                          NextChar();
                      }
                      break;

                  case '.':
                      if (hex)
                      {
                          cont = false;
                      }
                      else
                      {
                          fp = true;
                          _sb.Append(_currentChar);
                          NextChar();
                      }
                      break;

                  case 'E':
                  case 'e':
                      if (hex)
                      {
                          // In a hex number 'e' is just another digit. It must be
                          // consumed here: leaving it in place with cont still set
                          // would spin this loop forever on input such as "0x1E".
                          _sb.Append(_currentChar);
                          NextChar();
                      }
                      else
                      {
                          // Exponent, with optional sign
                          fp = true;
                          _sb.Append(_currentChar);
                          NextChar();
                          if (_currentChar == '+' || _currentChar == '-')
                          {
                              _sb.Append(_currentChar);
                              NextChar();
                          }
                      }
                      break;

                  default:
                      cont = false;
                      break;
              }
          }

          if (char.IsLetter(_currentChar))
              throw new InvalidDataException(string.Format("syntax error, invalid character following number '{0}'", _sb.ToString()));

          // Setup token
          String = _sb.ToString();
          CurrentToken = Token.Literal;

          // Setup literal kind
          if (fp)
          {
              LiteralKind = LiteralKind.FloatingPoint;
          }
          else if (signed)
          {
              LiteralKind = LiteralKind.SignedInteger;
          }
          else
          {
              LiteralKind = LiteralKind.UnsignedInteger;
          }
      }

      // Check the current token, throw InvalidDataException if mismatch
      public void Check(Token tokenRequired)
      {
          if (tokenRequired != CurrentToken)
          {
              throw new InvalidDataException(string.Format("syntax error, expected {0} found {1}", tokenRequired, CurrentToken));
          }
      }

      // Skip token which must match (throws InvalidDataException otherwise)
      public void Skip(Token tokenRequired)
      {
          Check(tokenRequired);
          NextToken();
      }

      // Skip token if it matches; returns true if a token was skipped
      public bool SkipIf(Token tokenRequired)
      {
          if (tokenRequired == CurrentToken)
          {
              NextToken();
              return true;
          }
          return false;
      }
  }
  508. }