How do I get the URL (href) of an <a> tag with Goutte? Alternatively, can anyone recommend a good PHP crawler library? Thank you.
<code><?php
require('./Vendor/autoload.php');

use Goutte\Client;

/**
 * Scrapes the news listing page at ningshan.gov.cn and collects the
 * date, title, absolute link URL and source label of each entry.
 */
class Spider
{
    /** @var Client Goutte client used to issue the HTTP request. */
    private $_client;

    /** @var \Symfony\Component\DomCrawler\Crawler Crawler over the fetched listing page. */
    private $_crawler;

    /**
     * Collected results, one list per field.
     * 'content' is declared but never filled by this class.
     *
     * @var array
     */
    public $_news = [
        'title' => [],
        'link' => [],
        'content' => [],
        'source' => [],
        'date' => [],
    ];

    /**
     * Fetches the listing page once; the getters below read from that response.
     *
     * @throws \Exception when the request fails (original exception kept as "previous")
     */
    public function __construct()
    {
        try {
            $this->_client = new Client();
            $this->_crawler = $this->_client->request('GET', 'http://www.ningshan.gov.cn/Category_90/Index.aspx');
            // Timeout configuration was left disabled in the original:
            // $client->getClient()->setDefaultOption('config/curl/'.CURLOPT_TIMEOUT, 10);
        } catch (Exception $e) {
            // Pass $e along so the original stack trace is not lost.
            throw new \Exception($e->getMessage(), 1, $e);
        }
    }

    /**
     * Appends the text of every span in the list to $_news['date'].
     */
    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
            $this->_news['date'][] = $node->text();
        });
    }

    /**
     * Appends title, absolute URL and source label for every list anchor
     * whose text is not the '宁陕要闻' category label.
     */
    public function getTitle()
    {
        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) {
            if ($node->text() !== '宁陕要闻') {
                $this->_news['title'][] = $node->text();
                // link() returns a Link object, which json_encode() would emit as {}.
                // getUri() is a method (not a property) and yields the absolute URL string.
                $this->_news['link'][] = $node->link()->getUri();
                $this->_news['source'][] = '宁陕要闻';
            }
        });
    }
}

//-----------------------------------
try {
    $spider = new Spider();
    $spider->getDate();
    $spider->getTitle();
    echo json_encode($spider->_news, JSON_UNESCAPED_UNICODE);
} catch (Exception $e) {
    echo $e->getMessage();
}
</code>
How do I get the URL (href) of an <a> tag with Goutte? Alternatively, can anyone recommend a good PHP crawler library? Thank you.
<code><?php
require('./Vendor/autoload.php');

use Goutte\Client;

/**
 * Scrapes the news listing page at ningshan.gov.cn and collects the
 * date, title, absolute link URL and source label of each entry.
 */
class Spider
{
    /** @var Client Goutte client used to issue the HTTP request. */
    private $_client;

    /** @var \Symfony\Component\DomCrawler\Crawler Crawler over the fetched listing page. */
    private $_crawler;

    /**
     * Collected results, one list per field.
     * 'content' is declared but never filled by this class.
     *
     * @var array
     */
    public $_news = [
        'title' => [],
        'link' => [],
        'content' => [],
        'source' => [],
        'date' => [],
    ];

    /**
     * Fetches the listing page once; the getters below read from that response.
     *
     * @throws \Exception when the request fails (original exception kept as "previous")
     */
    public function __construct()
    {
        try {
            $this->_client = new Client();
            $this->_crawler = $this->_client->request('GET', 'http://www.ningshan.gov.cn/Category_90/Index.aspx');
            // Timeout configuration was left disabled in the original:
            // $client->getClient()->setDefaultOption('config/curl/'.CURLOPT_TIMEOUT, 10);
        } catch (Exception $e) {
            // Pass $e along so the original stack trace is not lost.
            throw new \Exception($e->getMessage(), 1, $e);
        }
    }

    /**
     * Appends the text of every span in the list to $_news['date'].
     */
    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
            $this->_news['date'][] = $node->text();
        });
    }

    /**
     * Appends title, absolute URL and source label for every list anchor
     * whose text is not the '宁陕要闻' category label.
     */
    public function getTitle()
    {
        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) {
            if ($node->text() !== '宁陕要闻') {
                $this->_news['title'][] = $node->text();
                // link() returns a Link object, which json_encode() would emit as {}.
                // getUri() is a method (not a property) and yields the absolute URL string.
                $this->_news['link'][] = $node->link()->getUri();
                $this->_news['source'][] = '宁陕要闻';
            }
        });
    }
}

//-----------------------------------
try {
    $spider = new Spider();
    $spider->getDate();
    $spider->getTitle();
    echo json_encode($spider->_news, JSON_UNESCAPED_UNICODE);
} catch (Exception $e) {
    echo $e->getMessage();
}
</code>
I have found the answer now:
<code>// Fetch the page to search for links.
$crawler = $client->request('GET', 'http://www.symfony.com/blog/');

// Locate the anchor by its visible text, then wrap it as a Link object.
$anchor = $crawler->selectLink('Security Advisories');
$link = $anchor->link();

// getUri() is a method call; it returns the link's URI.
print_r($link->getUri());
</code>
Manual: http://symfony.com/doc/curren...
GIT: https://github.com/FriendsOfP...
Collection reference: http://flc.ren/2016/06/528.html