Crawler.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  class Crawler extends \SplObjectStorage
  {
      /**
       * @var string|null The current URI or the base href value
       */
      private $uri;

      /**
       * Constructor.
       *
       * @param mixed  $node A Node to use as the base for the crawling
       * @param string $uri  The current URI or the base href value
       *
       * @api
       */
      public function __construct($node = null, $uri = null)
      {
          $this->uri = $uri;
          $this->add($node);
      }

      /**
       * Removes all the nodes.
       *
       * @api
       */
      public function clear()
      {
          $this->removeAll($this);
      }

      /**
       * Adds a node to the current list of nodes.
       *
       * This method uses the appropriate specialized add*() method based
       * on the type of the argument. Values that are neither a \DOMNodeList,
       * an array, a string nor an object (null for instance) are ignored.
       *
       * @param null|\DOMNodeList|array|\DOMNode|string $node A node
       *
       * @api
       */
      public function add($node)
      {
          if ($node instanceof \DOMNodeList) {
              $this->addNodeList($node);
          } elseif (is_array($node)) {
              $this->addNodes($node);
          } elseif (is_string($node)) {
              $this->addContent($node);
          } elseif (is_object($node)) {
              $this->addNode($node);
          }
      }

      /**
       * Adds HTML/XML content.
       *
       * The content is parsed as XML when the type contains "xml" and as HTML
       * when it contains "html" (both matched case-insensitively, first match
       * wins); any other type is silently ignored. When the type carries a
       * "charset=" parameter its value is passed on as the charset, otherwise
       * ISO-8859-1 is used.
       *
       * @param string $content A string to parse as HTML/XML
       * @param string $type    The content type of the string (text/html when empty)
       */
      public function addContent($content, $type = null)
      {
          if (empty($type)) {
              $type = 'text/html';
          }

          // DOM only for HTML/XML content
          if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
              return null;
          }

          // Only the charset token itself is captured, so surrounding quotes and
          // any parameter following it ("; foo=bar") do not leak into the value.
          $charset = 'ISO-8859-1';
          if (preg_match('/charset=["\']?([^"\';\s]+)/i', $type, $charsetMatches)) {
              $charset = $charsetMatches[1];
          }

          // The type is matched case-insensitively, so the captured marker has
          // to be normalized before it is compared ("TEXT/XML" captures "X").
          if ('x' === strtolower($matches[1])) {
              $this->addXmlContent($content, $charset);
          } else {
              $this->addHtmlContent($content, $charset);
          }
      }

      /**
       * Adds an HTML content to the list of nodes.
       *
       * When the parsed content holds a <base> element with a non-empty href,
       * that href replaces the URI of this Crawler.
       *
       * @param string $content The HTML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addHtmlContent($content, $charset = 'UTF-8')
      {
          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // Empty input is skipped: there is nothing to parse and loadHTML()
          // rejects an empty source. Parse warnings stay suppressed on purpose
          // (best-effort parsing of real-world markup).
          if ('' !== trim($content)) {
              @$dom->loadHTML($content);
          }

          $this->addDocument($dom);

          // Plain XPath is used here so that loading HTML does not require
          // the optional CssSelector component.
          $base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
          $baseHref = current($base);
          if (count($base) && !empty($baseHref)) {
              $this->uri = $baseHref;
          }
      }

      /**
       * Adds an XML content to the list of nodes.
       *
       * @param string $content The XML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addXmlContent($content, $charset = 'UTF-8')
      {
          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // Empty input is skipped: loadXML() rejects an empty source.
          if ('' !== trim($content)) {
              // remove the default namespace to make XPath expressions simpler
              // NOTE: this rewrites every occurrence of "xmlns" in the content,
              // including occurrences inside text or attribute values.
              @$dom->loadXML(str_replace('xmlns', 'ns', $content));
          }

          $this->addDocument($dom);
      }

      /**
       * Adds a \DOMDocument to the list of nodes.
       *
       * Only the document element is added; a document without one adds nothing.
       *
       * @param \DOMDocument $dom A \DOMDocument instance
       *
       * @api
       */
      public function addDocument(\DOMDocument $dom)
      {
          if ($dom->documentElement) {
              $this->addNode($dom->documentElement);
          }
      }

      /**
       * Adds a \DOMNodeList to the list of nodes.
       *
       * @param \DOMNodeList $nodes A \DOMNodeList instance
       *
       * @api
       */
      public function addNodeList(\DOMNodeList $nodes)
      {
          foreach ($nodes as $node) {
              $this->addNode($node);
          }
      }

      /**
       * Adds an array of \DOMNode instances to the list of nodes.
       *
       * Each entry goes through add(), so nested arrays, node lists and
       * strings are accepted as well.
       *
       * @param array $nodes An array of \DOMNode instances
       *
       * @api
       */
      public function addNodes(array $nodes)
      {
          foreach ($nodes as $node) {
              $this->add($node);
          }
      }

      /**
       * Adds a \DOMNode instance to the list of nodes.
       *
       * For a \DOMDocument, its document element is added instead; a document
       * without a document element adds nothing.
       *
       * @param \DOMNode $node A \DOMNode instance
       *
       * @api
       */
      public function addNode(\DOMNode $node)
      {
          if ($node instanceof \DOMDocument) {
              $node = $node->documentElement;
              if (null === $node) {
                  return;
              }
          }

          $this->attach($node);
      }

      /**
       * Returns a node given its position in the node list.
       *
       * @param integer $position The position
       *
       * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
       *
       * @api
       */
      public function eq($position)
      {
          // getNode() yields null for an unknown position, and add() ignores null.
          return new static($this->getNode($position), $this->uri);
      }

      /**
       * Calls an anonymous function on each node of the list.
       *
       * The anonymous function receives the node and its position as arguments.
       *
       * Example:
       *
       *     $crawler->filter('h1')->each(function ($node, $i)
       *     {
       *         return $node->nodeValue;
       *     });
       *
       * @param \Closure $closure An anonymous function
       *
       * @return array An array of values returned by the anonymous function
       *
       * @api
       */
      public function each(\Closure $closure)
      {
          $data = array();
          foreach ($this as $i => $node) {
              $data[] = $closure($node, $i);
          }

          return $data;
      }

      /**
       * Reduces the list of nodes by calling an anonymous function.
       *
       * To remove a node from the list, the anonymous function must return false.
       *
       * @param \Closure $closure An anonymous function
       *
       * @return Crawler A Crawler instance with the selected nodes.
       *
       * @api
       */
      public function reduce(\Closure $closure)
      {
          $nodes = array();
          foreach ($this as $i => $node) {
              if (false !== $closure($node, $i)) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the first node of the current selection
       *
       * @return Crawler A Crawler instance with the first selected node
       *
       * @api
       */
      public function first()
      {
          return $this->eq(0);
      }

      /**
       * Returns the last node of the current selection
       *
       * @return Crawler A Crawler instance with the last selected node
       *
       * @api
       */
      public function last()
      {
          return $this->eq(count($this) - 1);
      }

      /**
       * Returns the siblings nodes of the current selection
       *
       * Only element siblings of the first node are returned; the node itself
       * is excluded. A node without a parent has no siblings.
       *
       * @return Crawler A Crawler instance with the sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function siblings()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $parent = $this->getNode(0)->parentNode;

          return new static($parent ? $this->sibling($parent->firstChild) : array(), $this->uri);
      }

      /**
       * Returns the next siblings nodes of the current selection
       *
       * @return Crawler A Crawler instance with the next sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function nextAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0)), $this->uri);
      }

      /**
       * Returns the previous sibling nodes of the current selection
       *
       * @return Crawler A Crawler instance with the previous sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function previousAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
      }

      /**
       * Returns the parents nodes of the current selection
       *
       * Only element ancestors of the first node are returned, nearest first.
       * The "_root" wrapper element created by filterXPath() is left out.
       *
       * @return Crawler A Crawler instance with the parents nodes of the current selection
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function parents()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);
          $nodes = array();

          while ($node = $node->parentNode) {
              if (1 === $node->nodeType && '_root' !== $node->nodeName) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the children nodes of the current selection
       *
       * Only element children of the first node are returned; a node without
       * children yields an empty Crawler.
       *
       * @return Crawler A Crawler instance with the children nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function children()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          // firstChild is null for a leaf node; sibling() must not be handed null.
          $firstChild = $this->getNode(0)->firstChild;

          return new static($firstChild ? $this->sibling($firstChild) : array(), $this->uri);
      }

      /**
       * Returns the attribute value of the first node of the list.
       *
       * @param string $attribute The attribute name
       *
       * @return string The attribute value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function attr($attribute)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->getAttribute($attribute);
      }

      /**
       * Returns the node value of the first node of the list.
       *
       * @return string The node value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function text()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->nodeValue;
      }

      /**
       * Extracts information from the list of nodes.
       *
       * You can extract attributes or/and the node value (_text).
       *
       * Example:
       *
       *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
       *
       * With exactly one attribute each entry of the result is the bare value;
       * otherwise each entry is an array of values, in the order requested.
       *
       * @param array $attributes An array of attributes
       *
       * @return array An array of extracted values
       *
       * @api
       */
      public function extract($attributes)
      {
          $attributes = (array) $attributes;
          $count = count($attributes);

          $data = array();
          foreach ($this as $node) {
              $elements = array();
              foreach ($attributes as $attribute) {
                  if ('_text' === $attribute) {
                      $elements[] = $node->nodeValue;
                  } else {
                      $elements[] = $node->getAttribute($attribute);
                  }
              }

              // An empty attribute list yields an empty array per node rather
              // than reading a non-existent first element.
              $data[] = 1 === $count ? $elements[0] : $elements;
          }

          return $data;
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The nodes are deep-copied under a "_root" element of a fresh document
       * and the expression is evaluated against that document, so the returned
       * Crawler holds copies, not the original nodes.
       *
       * @param string $xpath An XPath expression
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function filterXPath($xpath)
      {
          $document = new \DOMDocument('1.0', 'UTF-8');
          $root = $document->appendChild($document->createElement('_root'));
          foreach ($this as $node) {
              $root->appendChild($document->importNode($node, true));
          }

          $domxpath = new \DOMXPath($document);

          return new static($domxpath->query($xpath), $this->uri);
      }

      /**
       * Filters the list of nodes with a CSS selector.
       *
       * This method only works if you have installed the CssSelector Symfony Component.
       *
       * @param string $selector A CSS selector
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @throws \RuntimeException if the CssSelector Component is not available
       *
       * @api
       */
      public function filter($selector)
      {
          if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
              // @codeCoverageIgnoreStart
              throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
              // @codeCoverageIgnoreEnd
          }

          return $this->filterXPath(CssSelector::toXPath($selector));
      }

      /**
       * Selects links by name or alt value for clickable images.
       *
       * A link matches when its whitespace-normalized text, or the alt text of
       * an image directly inside it, contains the value as a whole word.
       *
       * @param string $value The link text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectLink($value)
      {
          // Padding with spaces on both sides turns contains() into a whole-word match.
          $word = static::xpathLiteral(' '.$value.' ');

          $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', $word).
                   sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', $word);

          return $this->filterXPath($xpath);
      }

      /**
       * Selects a button by name or alt value for images.
       *
       * Matches submit/button inputs by value, image inputs by alt text and
       * <button> elements by text (all as whole words), plus any input or
       * button whose id or name equals the value exactly.
       *
       * @param string $value The button text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectButton($value)
      {
          // Every user-supplied fragment goes through xpathLiteral() so that
          // quotes in the value can neither break nor alter the expression.
          $word = static::xpathLiteral(' '.$value.' ');
          $exact = static::xpathLiteral($value);

          $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $word).
                   sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $word, $exact, $exact).
                   sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $word, $exact, $exact);

          return $this->filterXPath($xpath);
      }

      /**
       * Returns a Link object for the first node in the list.
       *
       * @param string $method The method for the link (get by default)
       *
       * @return Link A Link instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function link($method = 'get')
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);

          return new Link($node, $this->uri, $method);
      }

      /**
       * Returns an array of Link objects for the nodes in the list.
       *
       * @return array An array of Link instances
       *
       * @api
       */
      public function links()
      {
          $links = array();
          foreach ($this as $node) {
              $links[] = new Link($node, $this->uri, 'get');
          }

          return $links;
      }

      /**
       * Returns a Form object for the first node in the list.
       *
       * @param array  $values An array of values for the form fields
       * @param string $method The method for the form
       *
       * @return Form A Form instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function form(array $values = null, $method = null)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $form = new Form($this->getNode(0), $this->uri, $method);

          if (null !== $values) {
              $form->setValues($values);
          }

          return $form;
      }

      /**
       * Converts a string into an XPath string literal.
       *
       * XPath 1.0 has no escape character inside string literals, so:
       *  - a string without single quotes is wrapped in single quotes;
       *  - otherwise a string without double quotes is wrapped in double quotes;
       *  - a string containing both is split on single quotes and rebuilt as a
       *    concat() call in which each single quote is a double-quoted literal.
       *
       * @param string $s The string to convert
       *
       * @return string The XPath literal (or concat() expression) for the string
       */
      public static function xpathLiteral($s)
      {
          if (false === strpos($s, "'")) {
              return sprintf("'%s'", $s);
          }

          if (false === strpos($s, '"')) {
              return sprintf('"%s"', $s);
          }

          $string = $s;
          $parts = array();
          while (true) {
              if (false !== $pos = strpos($string, "'")) {
                  $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                  $parts[] = "\"'\"";
                  $string = substr($string, $pos + 1);
              } else {
                  $parts[] = "'$string'";
                  break;
              }
          }

          // Separator first: the reversed (array, separator) order is not
          // accepted by current PHP versions.
          return sprintf("concat(%s)", implode(', ', $parts));
      }

      /**
       * Returns the node stored at the given position.
       *
       * @param integer $position The position in the node list
       *
       * @return \DOMNode|null The node, or null when the position does not exist
       */
      private function getNode($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return $node;
              }
          // @codeCoverageIgnoreStart
          }

          return null;
          // @codeCoverageIgnoreEnd
      }

      /**
       * Collects element nodes by walking a sibling chain.
       *
       * Starting at (and including) the given node, follows the given sibling
       * property until it runs out, keeping every element node except the
       * first node of this Crawler.
       *
       * @param \DOMNode $node       The node to start from
       * @param string   $siblingDir The property to follow: nextSibling or previousSibling
       *
       * @return array An array of \DOMNode instances
       */
      private function sibling($node, $siblingDir = 'nextSibling')
      {
          // The first node of the list does not change while walking.
          $current = $this->getNode(0);

          $nodes = array();
          do {
              if ($node !== $current && 1 === $node->nodeType) {
                  $nodes[] = $node;
              }
          } while ($node = $node->$siblingDir);

          return $nodes;
      }
  }