Crawler.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  19. class Crawler extends \SplObjectStorage
  20. {
  21. /**
  22. * @var string The current URI or the base href value
  23. */
  24. private $uri;
  25. /**
  26. * Constructor.
  27. *
  28. * @param mixed $node A Node to use as the base for the crawling
  29. * @param string $uri The current URI or the base href value
  30. *
  31. * @api
  32. */
  33. public function __construct($node = null, $uri = null)
  34. {
  35. $this->uri = $uri;
  36. $this->add($node);
  37. }
  38. /**
  39. * Removes all the nodes.
  40. *
  41. * @api
  42. */
  43. public function clear()
  44. {
  45. $this->removeAll($this);
  46. }
  47. /**
  48. * Adds a node to the current list of nodes.
  49. *
  50. * This method uses the appropriate specialized add*() method based
  51. * on the type of the argument.
  52. *
  53. * @param null|\DOMNodeList|array|\DOMNode $node A node
  54. *
  55. * @api
  56. */
  57. public function add($node)
  58. {
  59. if ($node instanceof \DOMNodeList) {
  60. $this->addNodeList($node);
  61. } elseif (is_array($node)) {
  62. $this->addNodes($node);
  63. } elseif (is_string($node)) {
  64. $this->addContent($node);
  65. } elseif (is_object($node)) {
  66. $this->addNode($node);
  67. }
  68. }
  69. /**
  70. * Adds HTML/XML content.
  71. *
  72. * @param string $content A string to parse as HTML/XML
  73. * @param null|string $type The content type of the string
  74. *
  75. * @return null|void
  76. */
  77. public function addContent($content, $type = null)
  78. {
  79. if (empty($type)) {
  80. $type = 'text/html';
  81. }
  82. // DOM only for HTML/XML content
  83. if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
  84. return null;
  85. }
  86. $charset = 'ISO-8859-1';
  87. if (false !== $pos = strpos($type, 'charset=')) {
  88. $charset = substr($type, $pos + 8);
  89. if (false !== $pos = strpos($charset, ';')) {
  90. $charset = substr($charset, 0, $pos);
  91. }
  92. }
  93. if ('x' === $matches[1]) {
  94. $this->addXmlContent($content, $charset);
  95. } else {
  96. $this->addHtmlContent($content, $charset);
  97. }
  98. }
  99. /**
  100. * Adds an HTML content to the list of nodes.
  101. *
  102. * @param string $content The HTML content
  103. * @param string $charset The charset
  104. *
  105. * @api
  106. */
  107. public function addHtmlContent($content, $charset = 'UTF-8')
  108. {
  109. $disableEntities = libxml_disable_entity_loader(true);
  110. $dom = new \DOMDocument('1.0', $charset);
  111. $dom->validateOnParse = true;
  112. @$dom->loadHTML($content);
  113. libxml_disable_entity_loader($disableEntities);
  114. $this->addDocument($dom);
  115. $base = $this->filter('base')->extract(array('href'));
  116. if (count($base)) {
  117. $this->uri = current($base);
  118. }
  119. }
  120. /**
  121. * Adds an XML content to the list of nodes.
  122. *
  123. * @param string $content The XML content
  124. * @param string $charset The charset
  125. *
  126. * @api
  127. */
  128. public function addXmlContent($content, $charset = 'UTF-8')
  129. {
  130. $disableEntities = libxml_disable_entity_loader(true);
  131. $dom = new \DOMDocument('1.0', $charset);
  132. $dom->validateOnParse = true;
  133. // remove the default namespace to make XPath expressions simpler
  134. @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
  135. libxml_disable_entity_loader($disableEntities);
  136. $this->addDocument($dom);
  137. }
  138. /**
  139. * Adds a \DOMDocument to the list of nodes.
  140. *
  141. * @param \DOMDocument $dom A \DOMDocument instance
  142. *
  143. * @api
  144. */
  145. public function addDocument(\DOMDocument $dom)
  146. {
  147. if ($dom->documentElement) {
  148. $this->addNode($dom->documentElement);
  149. }
  150. }
  151. /**
  152. * Adds a \DOMNodeList to the list of nodes.
  153. *
  154. * @param \DOMNodeList $nodes A \DOMNodeList instance
  155. *
  156. * @api
  157. */
  158. public function addNodeList(\DOMNodeList $nodes)
  159. {
  160. foreach ($nodes as $node) {
  161. $this->addNode($node);
  162. }
  163. }
  164. /**
  165. * Adds an array of \DOMNode instances to the list of nodes.
  166. *
  167. * @param array $nodes An array of \DOMNode instances
  168. *
  169. * @api
  170. */
  171. public function addNodes(array $nodes)
  172. {
  173. foreach ($nodes as $node) {
  174. $this->add($node);
  175. }
  176. }
  177. /**
  178. * Adds a \DOMNode instance to the list of nodes.
  179. *
  180. * @param \DOMNode $node A \DOMNode instance
  181. *
  182. * @api
  183. */
  184. public function addNode(\DOMNode $node)
  185. {
  186. if ($node instanceof \DOMDocument) {
  187. $this->attach($node->documentElement);
  188. } else {
  189. $this->attach($node);
  190. }
  191. }
  192. /**
  193. * Returns a node given its position in the node list.
  194. *
  195. * @param integer $position The position
  196. *
  197. * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
  198. *
  199. * @api
  200. */
  201. public function eq($position)
  202. {
  203. foreach ($this as $i => $node) {
  204. if ($i == $position) {
  205. return new static($node, $this->uri);
  206. }
  207. }
  208. return new static(null, $this->uri);
  209. }
  210. /**
  211. * Calls an anonymous function on each node of the list.
  212. *
  213. * The anonymous function receives the position and the node as arguments.
  214. *
  215. * Example:
  216. *
  217. * $crawler->filter('h1')->each(function ($node, $i)
  218. * {
  219. * return $node->nodeValue;
  220. * });
  221. *
  222. * @param \Closure $closure An anonymous function
  223. *
  224. * @return array An array of values returned by the anonymous function
  225. *
  226. * @api
  227. */
  228. public function each(\Closure $closure)
  229. {
  230. $data = array();
  231. foreach ($this as $i => $node) {
  232. $data[] = $closure($node, $i);
  233. }
  234. return $data;
  235. }
  236. /**
  237. * Reduces the list of nodes by calling an anonymous function.
  238. *
  239. * To remove a node from the list, the anonymous function must return false.
  240. *
  241. * @param \Closure $closure An anonymous function
  242. *
  243. * @return Crawler A Crawler instance with the selected nodes.
  244. *
  245. * @api
  246. */
  247. public function reduce(\Closure $closure)
  248. {
  249. $nodes = array();
  250. foreach ($this as $i => $node) {
  251. if (false !== $closure($node, $i)) {
  252. $nodes[] = $node;
  253. }
  254. }
  255. return new static($nodes, $this->uri);
  256. }
  257. /**
  258. * Returns the first node of the current selection
  259. *
  260. * @return Crawler A Crawler instance with the first selected node
  261. *
  262. * @api
  263. */
  264. public function first()
  265. {
  266. return $this->eq(0);
  267. }
  268. /**
  269. * Returns the last node of the current selection
  270. *
  271. * @return Crawler A Crawler instance with the last selected node
  272. *
  273. * @api
  274. */
  275. public function last()
  276. {
  277. return $this->eq(count($this) - 1);
  278. }
  279. /**
  280. * Returns the siblings nodes of the current selection
  281. *
  282. * @return Crawler A Crawler instance with the sibling nodes
  283. *
  284. * @throws \InvalidArgumentException When current node is empty
  285. *
  286. * @api
  287. */
  288. public function siblings()
  289. {
  290. if (!count($this)) {
  291. throw new \InvalidArgumentException('The current node list is empty.');
  292. }
  293. return new static($this->sibling($this->getNode(0)->parentNode->firstChild), $this->uri);
  294. }
  295. /**
  296. * Returns the next siblings nodes of the current selection
  297. *
  298. * @return Crawler A Crawler instance with the next sibling nodes
  299. *
  300. * @throws \InvalidArgumentException When current node is empty
  301. *
  302. * @api
  303. */
  304. public function nextAll()
  305. {
  306. if (!count($this)) {
  307. throw new \InvalidArgumentException('The current node list is empty.');
  308. }
  309. return new static($this->sibling($this->getNode(0)), $this->uri);
  310. }
  311. /**
  312. * Returns the previous sibling nodes of the current selection
  313. *
  314. * @return Crawler A Crawler instance with the previous sibling nodes
  315. *
  316. * @api
  317. */
  318. public function previousAll()
  319. {
  320. if (!count($this)) {
  321. throw new \InvalidArgumentException('The current node list is empty.');
  322. }
  323. return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
  324. }
  325. /**
  326. * Returns the parents nodes of the current selection
  327. *
  328. * @return Crawler A Crawler instance with the parents nodes of the current selection
  329. *
  330. * @throws \InvalidArgumentException When current node is empty
  331. *
  332. * @api
  333. */
  334. public function parents()
  335. {
  336. if (!count($this)) {
  337. throw new \InvalidArgumentException('The current node list is empty.');
  338. }
  339. $node = $this->getNode(0);
  340. $nodes = array();
  341. while ($node = $node->parentNode) {
  342. if (1 === $node->nodeType && '_root' !== $node->nodeName) {
  343. $nodes[] = $node;
  344. }
  345. }
  346. return new static($nodes, $this->uri);
  347. }
  348. /**
  349. * Returns the children nodes of the current selection
  350. *
  351. * @return Crawler A Crawler instance with the children nodes
  352. *
  353. * @throws \InvalidArgumentException When current node is empty
  354. *
  355. * @api
  356. */
  357. public function children()
  358. {
  359. if (!count($this)) {
  360. throw new \InvalidArgumentException('The current node list is empty.');
  361. }
  362. return new static($this->sibling($this->getNode(0)->firstChild), $this->uri);
  363. }
  364. /**
  365. * Returns the attribute value of the first node of the list.
  366. *
  367. * @param string $attribute The attribute name
  368. *
  369. * @return string The attribute value
  370. *
  371. * @throws \InvalidArgumentException When current node is empty
  372. *
  373. * @api
  374. */
  375. public function attr($attribute)
  376. {
  377. if (!count($this)) {
  378. throw new \InvalidArgumentException('The current node list is empty.');
  379. }
  380. return $this->getNode(0)->getAttribute($attribute);
  381. }
  382. /**
  383. * Returns the node value of the first node of the list.
  384. *
  385. * @return string The node value
  386. *
  387. * @throws \InvalidArgumentException When current node is empty
  388. *
  389. * @api
  390. */
  391. public function text()
  392. {
  393. if (!count($this)) {
  394. throw new \InvalidArgumentException('The current node list is empty.');
  395. }
  396. return $this->getNode(0)->nodeValue;
  397. }
  398. /**
  399. * Extracts information from the list of nodes.
  400. *
  401. * You can extract attributes or/and the node value (_text).
  402. *
  403. * Example:
  404. *
  405. * $crawler->filter('h1 a')->extract(array('_text', 'href'));
  406. *
  407. * @param array $attributes An array of attributes
  408. *
  409. * @return array An array of extracted values
  410. *
  411. * @api
  412. */
  413. public function extract($attributes)
  414. {
  415. $attributes = (array) $attributes;
  416. $data = array();
  417. foreach ($this as $node) {
  418. $elements = array();
  419. foreach ($attributes as $attribute) {
  420. if ('_text' === $attribute) {
  421. $elements[] = $node->nodeValue;
  422. } else {
  423. $elements[] = $node->getAttribute($attribute);
  424. }
  425. }
  426. $data[] = count($attributes) > 1 ? $elements : $elements[0];
  427. }
  428. return $data;
  429. }
  430. /**
  431. * Filters the list of nodes with an XPath expression.
  432. *
  433. * @param string $xpath An XPath expression
  434. *
  435. * @return Crawler A new instance of Crawler with the filtered list of nodes
  436. *
  437. * @api
  438. */
  439. public function filterXPath($xpath)
  440. {
  441. $document = new \DOMDocument('1.0', 'UTF-8');
  442. $root = $document->appendChild($document->createElement('_root'));
  443. foreach ($this as $node) {
  444. $root->appendChild($document->importNode($node, true));
  445. }
  446. $domxpath = new \DOMXPath($document);
  447. return new static($domxpath->query($xpath), $this->uri);
  448. }
  449. /**
  450. * Filters the list of nodes with a CSS selector.
  451. *
  452. * This method only works if you have installed the CssSelector Symfony Component.
  453. *
  454. * @param string $selector A CSS selector
  455. *
  456. * @return Crawler A new instance of Crawler with the filtered list of nodes
  457. *
  458. * @throws \RuntimeException if the CssSelector Component is not available
  459. *
  460. * @api
  461. */
  462. public function filter($selector)
  463. {
  464. if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
  465. // @codeCoverageIgnoreStart
  466. throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
  467. // @codeCoverageIgnoreEnd
  468. }
  469. return $this->filterXPath(CssSelector::toXPath($selector));
  470. }
  471. /**
  472. * Selects links by name or alt value for clickable images.
  473. *
  474. * @param string $value The link text
  475. *
  476. * @return Crawler A new instance of Crawler with the filtered list of nodes
  477. *
  478. * @api
  479. */
  480. public function selectLink($value)
  481. {
  482. $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
  483. sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
  484. return $this->filterXPath($xpath);
  485. }
  486. /**
  487. * Selects a button by name or alt value for images.
  488. *
  489. * @param string $value The button text
  490. *
  491. * @return Crawler A new instance of Crawler with the filtered list of nodes
  492. *
  493. * @api
  494. */
  495. public function selectButton($value)
  496. {
  497. $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', static::xpathLiteral(' '.$value.' ')).
  498. sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', static::xpathLiteral(' '.$value.' '), $value, $value).
  499. sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
  500. return $this->filterXPath($xpath);
  501. }
  502. /**
  503. * Returns a Link object for the first node in the list.
  504. *
  505. * @param string $method The method for the link (get by default)
  506. *
  507. * @return Link A Link instance
  508. *
  509. * @throws \InvalidArgumentException If the current node list is empty
  510. *
  511. * @api
  512. */
  513. public function link($method = 'get')
  514. {
  515. if (!count($this)) {
  516. throw new \InvalidArgumentException('The current node list is empty.');
  517. }
  518. $node = $this->getNode(0);
  519. return new Link($node, $this->uri, $method);
  520. }
  521. /**
  522. * Returns an array of Link objects for the nodes in the list.
  523. *
  524. * @return array An array of Link instances
  525. *
  526. * @api
  527. */
  528. public function links()
  529. {
  530. $links = array();
  531. foreach ($this as $node) {
  532. $links[] = new Link($node, $this->uri, 'get');
  533. }
  534. return $links;
  535. }
  536. /**
  537. * Returns a Form object for the first node in the list.
  538. *
  539. * @param array $values An array of values for the form fields
  540. * @param string $method The method for the form
  541. *
  542. * @return Form A Form instance
  543. *
  544. * @throws \InvalidArgumentException If the current node list is empty
  545. *
  546. * @api
  547. */
  548. public function form(array $values = null, $method = null)
  549. {
  550. if (!count($this)) {
  551. throw new \InvalidArgumentException('The current node list is empty.');
  552. }
  553. $form = new Form($this->getNode(0), $this->uri, $method);
  554. if (null !== $values) {
  555. $form->setValues($values);
  556. }
  557. return $form;
  558. }
  559. /**
  560. * Converts string for XPath expressions.
  561. *
  562. * Escaped characters are: quotes (") and apostrophe (').
  563. *
  564. * Examples:
  565. * <code>
  566. * echo Crawler::xpathLiteral('foo " bar');
  567. * //prints 'foo " bar'
  568. *
  569. * echo Crawler::xpathLiteral("foo ' bar");
  570. * //prints "foo ' bar"
  571. *
  572. * echo Crawler::xpathLiteral('a\'b"c');
  573. * //prints concat('a', "'", 'b"c')
  574. * </code>
  575. *
  576. * @param string $s String to be escaped
  577. *
  578. * @return string Converted string
  579. *
  580. */
  581. public static function xpathLiteral($s)
  582. {
  583. if (false === strpos($s, "'")) {
  584. return sprintf("'%s'", $s);
  585. }
  586. if (false === strpos($s, '"')) {
  587. return sprintf('"%s"', $s);
  588. }
  589. $string = $s;
  590. $parts = array();
  591. while (true) {
  592. if (false !== $pos = strpos($string, "'")) {
  593. $parts[] = sprintf("'%s'", substr($string, 0, $pos));
  594. $parts[] = "\"'\"";
  595. $string = substr($string, $pos + 1);
  596. } else {
  597. $parts[] = "'$string'";
  598. break;
  599. }
  600. }
  601. return sprintf("concat(%s)", implode($parts, ', '));
  602. }
  603. private function getNode($position)
  604. {
  605. foreach ($this as $i => $node) {
  606. if ($i == $position) {
  607. return $node;
  608. }
  609. // @codeCoverageIgnoreStart
  610. }
  611. return null;
  612. // @codeCoverageIgnoreEnd
  613. }
  614. private function sibling($node, $siblingDir = 'nextSibling')
  615. {
  616. $nodes = array();
  617. do {
  618. if ($node !== $this->getNode(0) && $node->nodeType === 1) {
  619. $nodes[] = $node;
  620. }
  621. } while ($node = $node->$siblingDir);
  622. return $nodes;
  623. }
  624. }