Tokenizer.php 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien.potencier@symfony-project.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\CssSelector;
  11. /**
  12. * Tokenizer lexes a CSS Selector to tokens.
  13. *
  14. * This component is a port of the Python lxml library,
  15. * which is copyright Infrae and distributed under the BSD license.
  16. *
  17. * @author Fabien Potencier <fabien.potencier@symfony-project.com>
  18. */
  19. class Tokenizer
  20. {
  21. /**
  22. * Takes a CSS selector and returns an array holding the Tokens
  23. * it contains.
  24. *
  25. * @param string $s The selector to lex.
  26. *
  27. * @return array Token[]
  28. */
  29. public function tokenize($s)
  30. {
  31. if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
  32. $mbEncoding = mb_internal_encoding();
  33. mb_internal_encoding('ASCII');
  34. }
  35. $tokens = array();
  36. $pos = 0;
  37. $s = preg_replace('#/\*.*?\*/#s', '', $s);
  38. while (true) {
  39. if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
  40. $preceding_whitespace_pos = $pos;
  41. $pos += strlen($match[0]);
  42. } else {
  43. $preceding_whitespace_pos = 0;
  44. }
  45. if ($pos >= strlen($s)) {
  46. if (isset($mbEncoding)) {
  47. mb_internal_encoding($mbEncoding);
  48. }
  49. return $tokens;
  50. }
  51. if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
  52. $sym = substr($s, $pos, strlen($match[0]));
  53. $tokens[] = new Token('Symbol', $sym, $pos);
  54. $pos += strlen($match[0]);
  55. continue;
  56. }
  57. $c = $s[$pos];
  58. $c2 = substr($s, $pos, 2);
  59. if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='))) {
  60. $tokens[] = new Token('Token', $c2, $pos);
  61. $pos += 2;
  62. continue;
  63. }
  64. if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'))) {
  65. if (in_array($c, array('.', '#', '[')) && $preceding_whitespace_pos > 0) {
  66. $tokens[] = new Token('Token', ' ', $preceding_whitespace_pos);
  67. }
  68. $tokens[] = new Token('Token', $c, $pos);
  69. ++$pos;
  70. continue;
  71. }
  72. if ('"' === $c || "'" === $c) {
  73. // Quoted string
  74. $old_pos = $pos;
  75. list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
  76. $tokens[] = new Token('String', $sym, $old_pos);
  77. continue;
  78. }
  79. $old_pos = $pos;
  80. list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
  81. $tokens[] = new Token('Symbol', $sym, $old_pos);
  82. continue;
  83. }
  84. }
  85. /**
  86. * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
  87. * and returns an array holding the unquoted string contained by $s and
  88. * the new position from which tokenizing should take over.
  89. *
  90. * @throws SyntaxError When expected closing is not found
  91. *
  92. * @param string $s The selector string containing the quoted string.
  93. * @param integer $pos The starting position for the quoted string.
  94. *
  95. * @return array
  96. */
  97. protected function tokenizeEscapedString($s, $pos)
  98. {
  99. $quote = $s[$pos];
  100. $pos = $pos + 1;
  101. $start = $pos;
  102. while (true) {
  103. $next = strpos($s, $quote, $pos);
  104. if (false === $next) {
  105. throw new SyntaxError(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
  106. }
  107. $result = substr($s, $start, $next - $start);
  108. if ('\\' === $result[strlen($result) - 1]) {
  109. // next quote character is escaped
  110. $pos = $next + 1;
  111. continue;
  112. }
  113. if (false !== strpos($result, '\\')) {
  114. $result = $this->unescapeStringLiteral($result);
  115. }
  116. return array($result, $next + 1);
  117. }
  118. }
  119. /**
  120. * Unescapes a string literal and returns the unescaped string.
  121. *
  122. * @throws SyntaxError When invalid escape sequence is found
  123. *
  124. * @param string $literal The string literal to unescape.
  125. *
  126. * @return string
  127. */
  128. protected function unescapeStringLiteral($literal)
  129. {
  130. return preg_replace_callback('#(\\\\(?:[A-Fa-f0-9]{1,6}(?:\r\n|\s)?|[^A-Fa-f0-9]))#', function ($matches) use ($literal)
  131. {
  132. if ($matches[0][0] == '\\' && strlen($matches[0]) > 1) {
  133. $matches[0] = substr($matches[0], 1);
  134. if (in_array($matches[0][0], array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f'))) {
  135. return chr(trim($matches[0]));
  136. }
  137. } else {
  138. throw new SyntaxError(sprintf('Invalid escape sequence %s in string %s', $matches[0], $literal));
  139. }
  140. }, $literal);
  141. }
  142. /**
  143. * Lexes selector $s and returns an array holding the name of the symbol
  144. * contained in it and the new position from which tokenizing should take
  145. * over.
  146. *
  147. * @throws SyntaxError When Unexpected symbol is found
  148. *
  149. * @param string $s The selector string.
  150. * @param integer $pos The position in $s at which the symbol starts.
  151. *
  152. * @return array
  153. */
  154. protected function tokenizeSymbol($s, $pos)
  155. {
  156. $start = $pos;
  157. if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
  158. // Goes to end of s
  159. return array(substr($s, $start), strlen($s));
  160. }
  161. $matchStart = $match[0][1];
  162. if ($matchStart == $pos) {
  163. throw new SyntaxError(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
  164. }
  165. $result = substr($s, $start, $matchStart - $start);
  166. $pos = $matchStart;
  167. return array($result, $pos);
  168. }
  169. }