Crawler.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\Parser as CssParser;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. */
  17. class Crawler extends \SplObjectStorage
  18. {
  19. private $uri;
  20. private $host;
  21. private $path;
  22. private $base;
  23. /**
  24. * Constructor.
  25. *
  26. * @param mixed $node A Node to use as the base for the crawling
  27. * @param string $uri The base URI to use for absolute links or form actions
  28. * @param string $base An optional base href for generating the uris for Form and Link.
  29. * This will be autodetected if $node has a <base> tag.
  30. */
  31. public function __construct($node = null, $uri = null, $base = null)
  32. {
  33. $this->uri = $uri;
  34. list($this->host, $this->path) = $this->parseUri($this->uri);
  35. $this->add($node);
  36. if ($base) {
  37. $this->base = $base;
  38. }
  39. }
  40. /**
  41. * Removes all the nodes.
  42. */
  43. public function clear()
  44. {
  45. $this->removeAll($this);
  46. }
  47. /**
  48. * Adds a node to the current list of nodes.
  49. *
  50. * This method uses the appropriate specialized add*() method based
  51. * on the type of the argument.
  52. *
  53. * @param null|\DOMNodeList|array|\DOMNode $node A node
  54. */
  55. public function add($node)
  56. {
  57. if ($node instanceof \DOMNodeList) {
  58. $this->addNodeList($node);
  59. } elseif (is_array($node)) {
  60. $this->addNodes($node);
  61. } elseif (is_string($node)) {
  62. $this->addContent($node);
  63. } elseif (is_object($node)) {
  64. $this->addNode($node);
  65. }
  66. }
  67. public function addContent($content, $type = null)
  68. {
  69. if (empty($type)) {
  70. $type = 'text/html';
  71. }
  72. // DOM only for HTML/XML content
  73. if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
  74. return null;
  75. }
  76. $charset = 'ISO-8859-1';
  77. if (false !== $pos = strpos($type, 'charset=')) {
  78. $charset = substr($type, $pos + 8);
  79. }
  80. if ('x' === $matches[1]) {
  81. $this->addXmlContent($content, $charset);
  82. } else {
  83. $this->addHtmlContent($content, $charset);
  84. }
  85. }
  86. /**
  87. * Adds an HTML content to the list of nodes.
  88. *
  89. * @param string $content The HTML content
  90. * @param string $charset The charset
  91. */
  92. public function addHtmlContent($content, $charset = 'UTF-8')
  93. {
  94. $dom = new \DOMDocument('1.0', $charset);
  95. $dom->validateOnParse = true;
  96. @$dom->loadHTML($content);
  97. $this->addDocument($dom);
  98. $base = $this->filter('base')->extract(array('href'));
  99. if (count($base)) {
  100. $this->base = current($base);
  101. }
  102. }
  103. /**
  104. * Adds an XML content to the list of nodes.
  105. *
  106. * @param string $content The XML content
  107. * @param string $charset The charset
  108. */
  109. public function addXmlContent($content, $charset = 'UTF-8')
  110. {
  111. $dom = new \DOMDocument('1.0', $charset);
  112. $dom->validateOnParse = true;
  113. // remove the default namespace to make XPath expressions simpler
  114. @$dom->loadXML(str_replace('xmlns', 'ns', $content));
  115. $this->addDocument($dom);
  116. }
  117. /**
  118. * Adds a \DOMDocument to the list of nodes.
  119. *
  120. * @param \DOMDocument $dom A \DOMDocument instance
  121. */
  122. public function addDocument(\DOMDocument $dom)
  123. {
  124. if ($dom->documentElement) {
  125. $this->addNode($dom->documentElement);
  126. }
  127. }
  128. /**
  129. * Adds a \DOMNodeList to the list of nodes.
  130. *
  131. * @param \DOMNodeList $nodes A \DOMNodeList instance
  132. */
  133. public function addNodeList(\DOMNodeList $nodes)
  134. {
  135. foreach ($nodes as $node) {
  136. $this->addNode($node);
  137. }
  138. }
  139. /**
  140. * Adds an array of \DOMNode instances to the list of nodes.
  141. *
  142. * @param array $nodes An array of \DOMNode instances
  143. */
  144. public function addNodes(array $nodes)
  145. {
  146. foreach ($nodes as $node) {
  147. $this->add($node);
  148. }
  149. }
  150. /**
  151. * Adds a \DOMNode instance to the list of nodes.
  152. *
  153. * @param \DOMNode $node A \DOMNode instance
  154. */
  155. public function addNode(\DOMNode $node)
  156. {
  157. if ($node instanceof \DOMDocument) {
  158. $this->attach($node->documentElement);
  159. } else {
  160. $this->attach($node);
  161. }
  162. }
  163. /**
  164. * Returns a node given its position in the node list.
  165. *
  166. * @param integer $position The position
  167. *
  168. * @return A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
  169. */
  170. public function eq($position)
  171. {
  172. foreach ($this as $i => $node) {
  173. if ($i == $position) {
  174. return new static($node, $this->uri);
  175. }
  176. }
  177. return new static(null, $this->uri);
  178. }
  179. /**
  180. * Calls an anonymous function on each node of the list.
  181. *
  182. * The anonymous function receives the position and the node as arguments.
  183. *
  184. * Example:
  185. *
  186. * $crawler->filter('h1')->each(function ($i, $node)
  187. * {
  188. * return $node->nodeValue;
  189. * });
  190. *
  191. * @param \Closure $closure An anonymous function
  192. *
  193. * @return array An array of values returned by the anonymous function
  194. */
  195. public function each(\Closure $closure)
  196. {
  197. $data = array();
  198. foreach ($this as $i => $node) {
  199. $data[] = $closure($node, $i);
  200. }
  201. return $data;
  202. }
  203. /**
  204. * Reduces the list of nodes by calling an anonymous function.
  205. *
  206. * To remove a node from the list, the anonymous function must return false.
  207. *
  208. * @param \Closure $closure An anonymous function
  209. *
  210. * @return Crawler A Crawler instance with the selected nodes.
  211. */
  212. public function reduce(\Closure $closure)
  213. {
  214. $nodes = array();
  215. foreach ($this as $i => $node) {
  216. if (false !== $closure($node, $i)) {
  217. $nodes[] = $node;
  218. }
  219. }
  220. return new static($nodes, $this->uri);
  221. }
  222. /**
  223. * Returns the first node of the current selection
  224. *
  225. * @return Crawler A Crawler instance with the first selected node
  226. */
  227. public function first()
  228. {
  229. return $this->eq(0);
  230. }
  231. /**
  232. * Returns the last node of the current selection
  233. *
  234. * @return Crawler A Crawler instance with the last selected node
  235. */
  236. public function last()
  237. {
  238. return $this->eq(count($this) - 1);
  239. }
  240. /**
  241. * Returns the siblings nodes of the current selection
  242. *
  243. * @return Crawler A Crawler instance with the sibling nodes
  244. *
  245. * @throws \InvalidArgumentException When current node is empty
  246. */
  247. public function siblings()
  248. {
  249. if (!count($this)) {
  250. throw new \InvalidArgumentException('The current node list is empty.');
  251. }
  252. return new static($this->sibling($this->getNode(0)->parentNode->firstChild), $this->uri);
  253. }
  254. /**
  255. * Returns the next siblings nodes of the current selection
  256. *
  257. * @return Crawler A Crawler instance with the next sibling nodes
  258. *
  259. * @throws \InvalidArgumentException When current node is empty
  260. */
  261. public function nextAll()
  262. {
  263. if (!count($this)) {
  264. throw new \InvalidArgumentException('The current node list is empty.');
  265. }
  266. return new static($this->sibling($this->getNode(0)), $this->uri);
  267. }
  268. /**
  269. * Returns the previous sibling nodes of the current selection
  270. *
  271. * @return Crawler A Crawler instance with the previous sibling nodes
  272. */
  273. public function previousAll()
  274. {
  275. if (!count($this)) {
  276. throw new \InvalidArgumentException('The current node list is empty.');
  277. }
  278. return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
  279. }
  280. /**
  281. * Returns the parents nodes of the current selection
  282. *
  283. * @return Crawler A Crawler instance with the parents nodes of the current selection
  284. *
  285. * @throws \InvalidArgumentException When current node is empty
  286. */
  287. public function parents()
  288. {
  289. if (!count($this)) {
  290. throw new \InvalidArgumentException('The current node list is empty.');
  291. }
  292. $node = $this->getNode(0);
  293. $nodes = array();
  294. while ($node = $node->parentNode) {
  295. if (1 === $node->nodeType && '_root' !== $node->nodeName) {
  296. $nodes[] = $node;
  297. }
  298. }
  299. return new static($nodes, $this->uri);
  300. }
  301. /**
  302. * Returns the children nodes of the current selection
  303. *
  304. * @return Crawler A Crawler instance with the children nodes
  305. *
  306. * @throws \InvalidArgumentException When current node is empty
  307. */
  308. public function children()
  309. {
  310. if (!count($this)) {
  311. throw new \InvalidArgumentException('The current node list is empty.');
  312. }
  313. return new static($this->sibling($this->getNode(0)->firstChild), $this->uri);
  314. }
  315. /**
  316. * Returns the attribute value of the first node of the list.
  317. *
  318. * @param string $attribute The attribute name
  319. *
  320. * @return string The attribute value
  321. *
  322. * @throws \InvalidArgumentException When current node is empty
  323. */
  324. public function attr($attribute)
  325. {
  326. if (!count($this)) {
  327. throw new \InvalidArgumentException('The current node list is empty.');
  328. }
  329. return $this->getNode(0)->getAttribute($attribute);
  330. }
  331. /**
  332. * Returns the node value of the first node of the list.
  333. *
  334. * @return string The node value
  335. *
  336. * @throws \InvalidArgumentException When current node is empty
  337. */
  338. public function text()
  339. {
  340. if (!count($this)) {
  341. throw new \InvalidArgumentException('The current node list is empty.');
  342. }
  343. return $this->getNode(0)->nodeValue;
  344. }
  345. /**
  346. * Extracts information from the list of nodes.
  347. *
  348. * You can extract attributes or/and the node value (_text).
  349. *
  350. * Example:
  351. *
  352. * $crawler->filter('h1 a')->extract(array('_text', 'href'));
  353. *
  354. * @param array $attributes An array of attributes
  355. *
  356. * @return array An array of extracted values
  357. */
  358. public function extract($attributes)
  359. {
  360. if (!is_array($attributes)) {
  361. $attributes = array($attributes);
  362. }
  363. $data = array();
  364. foreach ($this as $node) {
  365. $elements = array();
  366. foreach ($attributes as $attribute) {
  367. if ('_text' === $attribute) {
  368. $elements[] = $node->nodeValue;
  369. } else {
  370. $elements[] = $node->getAttribute($attribute);
  371. }
  372. }
  373. $data[] = count($attributes) > 1 ? $elements : $elements[0];
  374. }
  375. return $data;
  376. }
  377. /**
  378. * Filters the list of nodes with an XPath expression.
  379. *
  380. * @param string $xpath An XPath expression
  381. *
  382. * @return Crawler A new instance of Crawler with the filtered list of nodes
  383. */
  384. public function filterXPath($xpath)
  385. {
  386. $document = new \DOMDocument('1.0', 'UTF-8');
  387. $root = $document->appendChild($document->createElement('_root'));
  388. foreach ($this as $node) {
  389. $root->appendChild($document->importNode($node, true));
  390. }
  391. $domxpath = new \DOMXPath($document);
  392. return new static($domxpath->query($xpath), $this->uri, $this->base);
  393. }
  394. /**
  395. * Filters the list of nodes with a CSS selector.
  396. *
  397. * This method only works if you have installed the CssSelector Symfony Component.
  398. *
  399. * @param string $selector A CSS selector
  400. *
  401. * @return Crawler A new instance of Crawler with the filtered list of nodes
  402. *
  403. * @throws \RuntimeException if the CssSelector Component is not available
  404. */
  405. public function filter($selector)
  406. {
  407. if (!class_exists('Symfony\\Component\\CssSelector\\Parser')) {
  408. // @codeCoverageIgnoreStart
  409. throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
  410. // @codeCoverageIgnoreEnd
  411. }
  412. return $this->filterXPath(CssParser::cssToXpath($selector));
  413. }
  414. /**
  415. * Selects links by name or alt value for clickable images.
  416. *
  417. * @param string $value The link text
  418. *
  419. * @return Crawler A new instance of Crawler with the filtered list of nodes
  420. */
  421. public function selectLink($value)
  422. {
  423. $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
  424. sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
  425. return $this->filterXPath($xpath);
  426. }
  427. /**
  428. * Selects a button by name or alt value for images.
  429. *
  430. * @param string $value The button text
  431. *
  432. * @return Crawler A new instance of Crawler with the filtered list of nodes
  433. */
  434. public function selectButton($value)
  435. {
  436. $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', static::xpathLiteral(' '.$value.' ')).
  437. sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', static::xpathLiteral(' '.$value.' '), $value, $value).
  438. sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
  439. return $this->filterXPath($xpath);
  440. }
  441. /**
  442. * Returns a Link object for the first node in the list.
  443. *
  444. * @param string $method The method for the link (get by default)
  445. *
  446. * @return Link A Link instance
  447. *
  448. * @throws \InvalidArgumentException If the current node list is empty
  449. */
  450. public function link($method = 'get')
  451. {
  452. if (!count($this)) {
  453. throw new \InvalidArgumentException('The current node list is empty.');
  454. }
  455. $node = $this->getNode(0);
  456. return new Link($node, $method, $this->host, $this->path, $this->base);
  457. }
  458. /**
  459. * Returns an array of Link objects for the nodes in the list.
  460. *
  461. * @return array An array of Link instances
  462. */
  463. public function links()
  464. {
  465. $links = array();
  466. foreach ($this as $node) {
  467. $links[] = new Link($node, 'get', $this->host, $this->path);
  468. }
  469. return $links;
  470. }
  471. /**
  472. * Returns a Form object for the first node in the list.
  473. *
  474. * @param array $arguments An array of values for the form fields
  475. * @param string $method The method for the form
  476. *
  477. * @return Form A Form instance
  478. *
  479. * @throws \InvalidArgumentException If the current node list is empty
  480. */
  481. public function form(array $values = null, $method = null)
  482. {
  483. if (!count($this)) {
  484. throw new \InvalidArgumentException('The current node list is empty.');
  485. }
  486. $form = new Form($this->getNode(0), $method, $this->host, $this->path, $this->base);
  487. if (null !== $values) {
  488. $form->setValues($values);
  489. }
  490. return $form;
  491. }
  492. static public function xpathLiteral($s)
  493. {
  494. if (false === strpos($s, "'")) {
  495. return sprintf("'%s'", $s);
  496. }
  497. if (false === strpos($s, '"')) {
  498. return sprintf('"%s"', $s);
  499. }
  500. $string = $s;
  501. $parts = array();
  502. while (true) {
  503. if (false !== $pos = strpos($string, "'")) {
  504. $parts[] = sprintf("'%s'", substr($string, 0, $pos));
  505. $parts[] = "\"'\"";
  506. $string = substr($string, $pos + 1);
  507. } else {
  508. $parts[] = "'$string'";
  509. break;
  510. }
  511. }
  512. return sprintf("concat(%s)", implode($parts, ', '));
  513. }
  514. private function getNode($position)
  515. {
  516. foreach ($this as $i => $node) {
  517. if ($i == $position) {
  518. return $node;
  519. }
  520. // @codeCoverageIgnoreStart
  521. }
  522. return null;
  523. // @codeCoverageIgnoreEnd
  524. }
  525. private function parseUri($uri)
  526. {
  527. if ('http' !== substr($uri, 0, 4)) {
  528. return array(null, '/');
  529. }
  530. $path = parse_url($uri, PHP_URL_PATH);
  531. return array(preg_replace('#^(.*?//[^/]+)\/.*$#', '$1', $uri), $path);
  532. }
  533. private function sibling($node, $siblingDir = 'nextSibling')
  534. {
  535. $nodes = array();
  536. do {
  537. if ($node !== $this->getNode(0) && $node->nodeType === 1) {
  538. $nodes[] = $node;
  539. }
  540. } while ($node = $node->$siblingDir);
  541. return $nodes;
  542. }
  543. }