Code source: jUnion
Applicable platforms: Windows, Linux (Ubuntu), php-5.2.5+, Apache
Function: Captures (downloads) the pictures of an entire site. The current version is not built on PHP's curl extension; this will be improved later.
Configuration: config directory
domain_name: domain name (default: bizhibar.com)
request_site: website URL (default: http://www.bizhibar.com/)
request_url: Which page of the website to start from (default: http://www.bizhibar.com/)
accept_type: Image types to capture (default: gif, bmp, png, ico, jpg, jpeg)
save_path: Directory the pictures are saved to (default: savefiles/)
partition_name: Name prefix of the image sub-directories (default: img_)
dir_file_limit: How many files each directory allows (default: 100)
serialize_img_size: How many image addresses are read before they are cached in the accompImg file in the cache directory. These addresses will be ignored the next time you continue to crawl. (Default: 30)
serialize_url_size: Same as serialize_img_size, but for page links: how many link addresses are read before they are cached in the overURL file in the cache directory.
The URLs stored in overURL will be ignored the next time you continue to crawl. (Default: 10)
Note: Criticism and advice are welcome. If you run into new problems or see areas that need improvement, please send me feedback.
<?php
/**
 * Crawler entry point.
 *
 * Lifts the execution time limit, loads the project constants and the
 * Capture class, describes where the configuration and cache files live,
 * and starts the crawl.
 */

// A crawl may run for a long time, so remove PHP's execution time limit.
set_time_limit( 0 );

// NOTE(review): __Home__ and __Os__ are presumably defined by
// Capture.const.php (a root path ending in a separator, and the directory
// separator itself) - confirm against that file.
require dirname( __FILE__ ) . DIRECTORY_SEPARATOR . 'include' . DIRECTORY_SEPARATOR . 'Capture.const.php';
require __Home__ . 'include' . __Os__ . 'Capture.class.php';

// File locations handed to Capture: two configuration files and two cache files.
$_cfg = array();
$_cfg['site']      = __Home__ . 'config' . __Os__ . 'capture.site.php';
$_cfg['preg']      = __Home__ . 'config' . __Os__ . 'capture.preg.php';
$_cfg['accompImg'] = __Home__ . 'cache' . __Os__ . 'accompImg';
$_cfg['overURL']   = __Home__ . 'cache' . __Os__ . 'overURL';

// Capture takes the configuration by reference, so it must be a variable.
$_parse = new Capture( $_cfg );
$_parse->parseQuestUrl();
?>
<?php
/**
 * The main class: fetches the configured start page, hands every fetched
 * page to Pivotal for parsing, and follows the returned links that belong
 * to the configured domain. URLs already followed are remembered in a cache
 * file so that an interrupted crawl can skip them next time.
 *
 * @author pankai<530911044@qq.com>
 * @date 2013-08-10
 */
class Capture
{
    /** @var array file locations passed to the constructor (site, preg, accompImg, overURL) */
    private static $_Config = array();

    /** @var array|null site settings returned by the 'site' configuration file */
    private static $_CapSite = NULL;

    /** @var array|null patterns returned by the 'preg' configuration file, with '_request_site' substituted */
    private static $_CapPreg = NULL;

    /** @var array URLs that have already been followed */
    private static $_overURL = array();

    /** @var bool FALSE while a page is being parsed, TRUE once parsing has returned */
    private $_mark = FALSE;

    /** @var int multiplier applied to the per-page delay */
    private static $_markTime = 1;

    /**
     * initialize the main class: Capture
     *
     * Loads both configuration files, substitutes the site URL into the
     * patterns, and restores the two cache files when they hold usable data.
     *
     * @param $_cfg array file locations with the keys 'site', 'preg', 'accompImg' and 'overURL'
     */
    public function __construct( &$_cfg )
    {
        self::$_Config  = &$_cfg;
        self::$_CapSite = require $_cfg['site'];
        self::$_CapPreg = require $_cfg['preg'];

        foreach( self::$_CapPreg as $_key => $_value ) {
            self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
        }

        self::import( 'file.OperateFile' );
        $_visited = self::readCache( $_cfg['overURL'] );
        if( $_visited !== NULL ) {
            self::$_overURL = $_visited;
        }

        self::import( 'pivotal.Pivotal' );
        $_images = self::readCache( $_cfg['accompImg'] );
        if( $_images !== NULL ) {
            Pivotal::$_accompImg = $_images;
        }
    }

    /**
     * load class, follow Java programmer (package) style: import com.jUnion.Capture
     *
     * 'file.OperateFile' is resolved to include/file/OperateFile.class.php
     * below __Home__. Only call this with trusted, hard-coded names, because
     * the name becomes part of a require path.
     *
     * @param $_class string dotted class path
     */
    public static function import( $_class )
    {
        require_once __Home__ . 'include' . __Os__ . str_replace( '.', __Os__, $_class ) . '.class.php';
    }

    /**
     * Read one serialized cache file.
     *
     * A missing, empty, unreadable or corrupt file (anything that does not
     * unserialize to an array) is treated as "no cache" instead of poisoning
     * the crawl state with a non-array value.
     *
     * NOTE(review): unserialize() must only ever see files written by this
     * program; never point the cache paths at untrusted data.
     *
     * @param $_path string cache file location
     * @return array|null the cached array, or NULL when there is nothing usable
     */
    private static function readCache( $_path )
    {
        if( !file_exists( $_path ) || filesize( $_path ) <= 0 ) {
            return NULL;
        }

        $_contents = OperateFile::readText( $_path, filesize( $_path ) );
        if( !is_string( $_contents ) || $_contents === '' ) {
            return NULL;
        }

        $_data = unserialize( $_contents );

        return is_array( $_data ) ? $_data : NULL;
    }

    /**
     * Write the list of followed URLs to the overURL cache file.
     * Nothing is written while the list is still empty.
     */
    private function saveVisited()
    {
        if( count( self::$_overURL ) > 0 ) {
            OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
        }
    }

    /**
     * create an instance of Pivotal class
     *
     * @param $_source mixed whatever Http::get() returned for the page
     * @return mixed the result of Pivotal::parseUrl(); roundTagA() treats it
     *               as a (possibly nested) array of URL strings
     */
    private function getCapInstance( &$_source )
    {
        $this->_mark = FALSE;
        $_Captal = new Pivotal( self::$_Config, $_source );
        $_tagA = $_Captal->parseUrl();
        $this->_mark = TRUE;

        return $_tagA;
    }

    /**
     * Fetch one URL, parse it, and wait the configured delay.
     *
     * NOTE(review): getCapInstance() always leaves $_mark TRUE when it
     * returns, so the back-off branch below never runs and the loop executes
     * exactly once. It is kept so the original retry structure stays visible.
     * NOTE(review): the multiplier is reset to 'preform_page_time' rather
     * than to its initial value 1, so later delays are the setting squared.
     * That is the original behaviour; confirm whether it is intended.
     *
     * @param $_url string address to fetch
     * @return mixed the links returned by getCapInstance()
     */
    private function fetchPage( $_url )
    {
        do {
            // getCapInstance() takes its argument by reference, so the page
            // source has to live in a variable first.
            $_source = Http::get( $_url );
            $_tagA   = $this->getCapInstance( $_source );

            // sleep() needs a non-negative integer number of seconds.
            sleep( max( 0, (int) ( self::$_CapSite['preform_page_time'] * self::$_markTime ) ) );

            if( $this->_mark === TRUE ) {
                self::$_markTime = self::$_CapSite['preform_page_time'];
                break;
            }
            self::$_markTime *= 2;
        } while( true );

        return $_tagA;
    }

    /**
     * go forward one by one
     *
     * Walks a (possibly nested) list of links depth-first. A link is followed
     * only when it is a string containing the configured domain name and has
     * not been followed before. Every followed link is recorded, and the
     * record is written to disk every 'serialize_url_size' links.
     *
     * @param $_tagArr mixed links to follow; anything that is not a non-empty array is ignored
     */
    private function roundTagA( &$_tagArr )
    {
        if( !is_array( $_tagArr ) || count( $_tagArr ) === 0 ) {
            return;
        }

        // A value of 0 (or a missing setting) disables the periodic write
        // instead of triggering a modulo-by-zero error.
        $_flushEvery = isset( self::$_CapSite['serialize_url_size'] )
            ? (int) self::$_CapSite['serialize_url_size']
            : 0;

        foreach( $_tagArr as $_item ) {
            if( is_array( $_item ) ) {
                $this->roundTagA( $_item );
                continue;
            }

            if( !is_string( $_item ) || stripos( $_item, self::$_CapSite['domain_name'] ) === FALSE ) {
                continue;
            }

            if( in_array( $_item, self::$_overURL, true ) ) {
                continue;
            }

            self::$_overURL[] = $_item;
            if( $_flushEvery > 0 && count( self::$_overURL ) % $_flushEvery === 0 ) {
                $this->saveVisited();
            }

            /* parse the page and follow the links it returns */
            $_tagA = $this->fetchPage( $_item );
            $this->roundTagA( $_tagA );
        }
    }

    /**
     * Start the crawl at the configured 'request_url' and follow every link
     * reachable from it. When the crawl ends, the followed URLs are written
     * out once more so the last partial batch is not lost.
     */
    public function parseQuestUrl()
    {
        self::import( 'http.Http' );

        // Held in a variable because getCapInstance() takes a reference.
        $_source    = Http::get( self::$_CapSite['request_url'] );
        $_round_Arr = $this->getCapInstance( $_source );

        $this->roundTagA( $_round_Arr );
        $this->saveVisited();
    }
}
?>