• 技术文章 >php教程 >PHP源码

    采集 PHP

    PHP中文网PHP中文网2016-05-25 17:15:10原创579
    充分利用正则的强大字串处理能力,使用简单,功能也比较简单,能满足一般应用,功能也在不断完善中,使用过程:设置一个初始url,添加导航规则,添加采集字段和规则,保存输出即可

    <?php
    set_time_limit(0);
    header("Content-type: text/html; charset=utf-8");
    /**
    * 采集程序类
    * @author  shooting
    * @version 1.0.0
    */
    class spider
    {
    /**
    * 采集的终端页地址
    *
    * @var array
    */
    var $pages = array();
    /**
    * 采集结果
    *
    * @var array
    */
    var $result = array();
    /**
    * 第一层链接页面
    *
    * @var array
    */
    var $startUrls = array();
    /**
    * 超时时间
    *
    * @var integer
    */
    var $timeout;
    /**
    * 正在处理的文件内容
    *
    * @var string
    */
    var $httpContent;
    /**
    * 正在处理的文件头
    *
    * @var array
    */
    var $httpHead=array();
    /**
    * 自定义的head数组
    *
    * @var array
    */
    var $putHead = array();
    /**
    * 采集字段与规则数组
    *
    * @var array
    */
    var $field_arr = array();
    /**
    * 采集层次数
    *
    * @var interger
    */
    var $deep;
    /**
    * 采集层次结构
    *
    * @var array
    */
    var $layout_arr = array();
    /**
    * 采集限制条数
    *
    * @var integer
    */
    var $limit = 0;
    /**
    * 程序运行时间
    *
    * @var float
    */
    var $runtime = 0;
    /**
    * 被采集页面编码
    *
    * @var string
    */
    var $charset = 'UTF-8';
    /**
    * 页面引用地址
    *
    * @var string
    */
    var $httpreferer;
    var $pagelimit = 0;
    var $filepath = './';
    function spider()
    {
      $this->timeout = 30;
    }
    /**
    * 运行采集
    *
    * @return array
    */
    function run()
    {
      $begintime = $this->microtime_float();
      $cnt = 1;
      foreach ($this->startUrls as $starturl){
      /**
       * 解析出起始地址中的页码区间
       */
       if(preg_match("~\{(\d+),(\d+)\}~",$starturl,$pagenum)){
        $pagebegin = intval($pagenum[1]);
        $pageend = intval($pagenum[2]);
        for(;$pagebegin<=$pageend;$pagebegin++){
         $starturl = str_replace($pagenum[0],$pagebegin,$starturl);
         $urllists = $this->getLists($this->layout_arr[0]['pattern'],$this->getContent($starturl));
         foreach ($urllists as $url){
          if(($this->limit > 0 && $cnt <= $this->limit)||$this->limit == 0)
          {
           $this->filterContent($this->getContent($url,$starturl));
           $cnt++;
          }
         }
        }
       }else{
        $urllists = $this->getLists($this->layout_arr[0]['pattern'],$this->getContent($starturl));
        foreach ($urllists as $url){
         if(($this->limit > 0 && $cnt <= $this->limit)||$this->limit == 0)
         {
          $this->filterContent($this->getContent($url,$starturl));
          $cnt++;
         }
        }
       }
      }
      $this->runtime =  $this->microtime_float()-$begintime;
      return $this->result;
    }
    /**
    * 从文字段中根据规则提取出url列表
    *
    * @param string $pattern
    * @param string $content
    * @return Array
    */
    function getLists($pattern='',$content='')
    {
      if(strpos($pattern,'{*}') === false)return array($pattern);
      $pattern = preg_quote($pattern);
      $pattern = str_replace('\{\*\}','([^\'\">]*)',$pattern);
      $pattern = "~".$pattern."~is";
      preg_match_all($pattern,$content,$preg_rs);
      return array_unique($preg_rs[0]);
    }
    /**
    * 获取指定url的html内容包括头
    *
    * @param string $url
    * @return string
    */
    function getContent($url,$referer = '')
    {
      $url = $this->urlRtoA($url,$referer);
      preg_match("/(http:\/\/)([^:\/]*):?(\d*)(\/?.*)/i",$url,$preg_rs);
      $host = $preg_rs[2];
      $port = empty($preg_rs[3])?80:$preg_rs[3];
      $innerUrl = $preg_rs[4];
      $fsp = fsockopen($host,$port,$errno,$errstr,$this->timeout);
      if(!$fsp)$this->log($errstr.'('.$errno.')');
      $output = "GET $url HTTP/1.0\r\nHost: $host\r\n";
      if(!isset($this->putHead['Accept']))$this->putHead['Accept']= "*/*";
      if(!isset($this->putHead['User-Agent']))$this->putHead['User-Agent']='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)';
      if(!isset($this->putHead['Refer'])){
       $this->putHead['Refer'] = ($referer == '')?'http://'.$host:$referer;
      }
      foreach ($this->putHead as $headname => $headvalue){
       $output .= trim($headname).': '.trim($headvalue)."\r\n";
      }
      $output .= "Connection: close\r\n\r\n";
      fwrite($fsp,$output);
      $content = '';
      while (!feof($fsp)) {
            $content .= fgets($fsp,256);
        }
        fclose($fsp);
        $this->getHead($content);
        $this->httpContent = $content;
        if(strtoupper($this->charset) != 'UTF-8'){
         $content = iconv($this->charset,'utf-8',$content);
        }else if(!empty($this->httpHead['charset']) && $this->httpHead['charset']!='UTF-8')
        {
        $content = iconv($this->httpHead['charset'],'utf-8',$content);
        }
        $this->httpreferer = $referer;
      return $content;
    }
    /**
    * 按照规则从内容提取所有字段
    * @param Array
    * @return Array
    */
    function filterContent($content='')
    {
      $rs = array();
      foreach ($this->field_arr as $field => $fieldinfo){
       $rs[$field] = $this->getPregField($fieldinfo,$content);
      }
      $this->result[] = $rs;
    }
    /**
    * 相对路径转化为绝对路径
    *
    * @param string $relative
    * @param string $referer
    * @return string
    */
    function urlRtoA($relative,$referer)
    {
      /**
       * 去除#后面的部分
       */
      $pos = strpos($relative,'#');
      if($pos >0)$relative = substr($relative,0,$pos);
      /**
       * 检测路径如果是绝对地址直接返回
       */
      if(preg_match("~^(http|ftp)://~i",$relative))
      return $relative;
      /**
       * 解析引用地址,获得协议,主机等信息
       */
      preg_match("~((http|ftp)://([^/]*)(.*/))([^/#]*)~i", $referer, $preg_rs);
      $parentdir = $preg_rs[1];
      $petrol = $preg_rs[2].'://';
      $host = $preg_rs[3];
      /**
       * 如果以/开头的情况
       */
      if(preg_match("~^/~i",$relative))
       return $petrol.$host.$relative;
      return $parentdir.$relative;
    }
    /**
    * 根据规则提取一个字段
    *
    * @param string $pattern
    * @param string $content
    * @return string
    */
    function getPregField($fieldinfo,$content)
    {
      /**
       * 规则为固定值的情况,直接返回固定值
       */
      if(strpos($fieldinfo['pattern'],'{'.$fieldinfo['field'].'}') === false)
      return $fieldinfo['pattern'];
      if($fieldinfo['isregular'] == 'true'){
       $pattern = $fieldinfo['pattern'];
       $pattern = str_replace('{'.$fieldinfo['field'].'}','(?P<'.$fieldinfo['field'].'>.*?)',$pattern);
      }else{
       $pattern = preg_quote($fieldinfo['pattern']);
       $pattern = str_replace('\{'.$fieldinfo['field'].'\}','(?P<'.$fieldinfo['field'].'>.*?)',$pattern);
      }
      $pattern = "~".$pattern."~is";
      preg_match($pattern,$content,$preg_rs);
      $fieldresult = $preg_rs[$fieldinfo['field']];
      /**
       * 去掉换行符
       */
      $fieldresult = preg_replace("~[\r\n]*~is",'',$fieldresult);
      /**
       * 对采集到的结果根据规则再进行二次替换处理
       */
      $replace_arr = $fieldinfo['replace'];
      if(is_array($replace_arr)){
       $replace_arr[0] = "~".$replace_arr[0]."~s";
       $fieldresult = preg_replace($replace_arr[0],$replace_arr[1],$fieldresult);
      }
      /**
       * 针对有下一页的字段递归采集
       */
      if($this->pagelimit == 0){
      if($fieldinfo['nextpage'] != ''){
       $pattern = $fieldinfo['nextpage'];
       $pattern = str_replace('{nextpage}','(?P<nextpage>[^\'\">]*?)',$pattern);
       $pattern = "~".$pattern."~is";
       if(preg_match($pattern,$content,$preg_rs) && $preg_rs['nextpage'] != ''){
        $fieldresult .= $this->getPregField($fieldinfo,$this->getContent($preg_rs['nextpage'],$this->httpreferer));
       }
      }
      }
      if(!empty($fieldinfo['callback']))$fieldresult = $fieldinfo['callback']($fieldresult);
      return $fieldresult;
    }
    /**
    * 添加一个采集字段和规则
    *
    * @param string $field
    * @param string $pattern
    */
    function addField($field,$pattern,$replace_arr='',$isregular='false',$nextpage = '',$callback='')
    {
      $rs = array(
        'field' => $field,
        'pattern' => $pattern,
        'replace' => $replace_arr,
        'isregular' => $isregular,
        'nextpage' => $nextpage,
        'callback'=>$callback
        );
      $this->field_arr[$field] =$rs;
    }
    /**
    * 输出
    *
    */
    function output()
    {
      echo "The result is:<br/>";
      echo "runtime :$this->runtime S<br/><pre>";
      print_r($this->result);
      echo "</pre>";
    }
    /**
    * 输出到XLS文件
    *
    * @param string $file
    */
    function saveXls($file = 'spider_result.xls')
    {
      $fp = fopen($file,'w');
      if($fp){
       foreach ($this->result as  $result)
       {
        $line = implode("\t",$result)."\n";
        fputs($fp,$line);
       }
      }
      fclose($fp);
      echo 'The result has been saved to '.$file.'.<br/>Cost time:'.$this->runtime;
    }
    function saveSql($table = 'spider_result',$file = 'spider_result.sql')
    {
      $fp = fopen($file,'w');
      if($fp){
       foreach($this->field_arr as $fieldinfo){
        $sql_key .= ', `'.$fieldinfo['field'].'`';
       }
       $sql_key = substr($sql_key,1);
       foreach ($this->result as  $result)
       {
        $sql_value = array();
        foreach ($result as $key => $value){
         $sql_value[] = "'".$this->addslash($value)."'";
        }
        $line ="INSERT INTO `$table` ( $sql_key ) VALUES (".join(', ',$sql_value).");\r\n";
        fputs($fp,$line);
       }
      }
      fclose($fp);
      echo 'The result has been saved to '.$file.'.<br/>Cost time:'.$this->runtime;
    }
    /**
    * 取得响应内容的头部信息
    *
    * @param string $content
    * @return array
    */
    function getHead($content)
    {
      $head = explode("\r\n\r\n",$content);
      $head = $head[0];
    //  echo $head;
      if(!preg_match("~charset\=(.*)\r\n~i",$head,$preg_rs))
      preg_match('~charset=([^\"\']*)~i',$content,$preg_rs);
      $this->httpHead['charset'] = strtoupper(trim($preg_rs[1]));
    //  preg_match("~charset\=(.*)~i",$head,$preg_rs);
      return $this->httpHead;
    }
    /**
    * 设置采集页面的编码
    * 在程序不能自动识别的情况下采集前要手动调用此函数
    *
    * @param string $charset
    */
    function setCharset($charset){
      $this->charset = strtoupper($charset);
    }
    /**
    * 设置第一层链接页面地址
    *
    * @param array $url_arr
    */
    function setStartUrls($url_arr)
    {
      $this->startUrls = $url_arr;
    }
    /**
    * 增加一个第一层链接页面地址
    *
    * @param string $url
    */
    function addStartUrl($url)
    {
      $this->startUrls[] = $url;
    }
    /**
    * 添加一个采集层次
    *
    * @param integer $deep
    * @param string $layout
    * @param boolean $isSimple
    * @param boolean $isPageBreak
    * @param string $pattern
    */
    function addLayer($deep,$layout,$pattern = '',$isSimple = 'false',$isPageBreak = 'false')
    {
      $this->layout_arr[$deep] = array(
             'layout'=>$layout,
             'isSimple'=>$isSimple,
             'isPageBreak'=>$isPageBreak,
             'pattern'=>$pattern );
    }
    /**
    * 自定义head
    * @param string $namespace
    * @param string $value
    */
    function setHead($name,$value)
    {
      $this->putHead[$name] = $value;
    }
    
    /**
    * 清除html代码
    *  @param string $content;
    *  @param string $cleartags
    *  @return string
    */
    function clearHtml($content,$cleartags = 'p')
    {
      $cleartags_arr = explode('|',$cleartags);
      foreach ($cleartags_arr as $cleartag){
       $pattern = '~<\/?'.$cleartag.'[^>]*>~is';
       $content = preg_replace($pattern,'',$content);
      }
      return $content;
    }
    /**
    * 日志
    *
    */
    function log($str)
    {
      echo $str."<br/>\n";
    }
    /**
    * 获取采集运行时间
    *
    * @return float
    */
    function getRuntime()
    {
      return $this->runtime;
    }
    function microtime_float()
    {
         list($usec, $sec) = explode(" ", microtime());
         return ((float)$usec + (float)$sec);
    }
    function addslash($string)
    {
      return addslashes($string);
    }
    }
    $spider  = new spider();
    $spider->addStartUrl('http://hi.baidu.com/shuntian/blog/index/{0,5}');
    $spider->setCharset('gb2312');
    $spider->addLayer(0,'list','/shuntian/blog/item/{*}.html');
    $spider->addField('title','<title>{title}</title>',array('_顺者的天空-shooting&#39;s sky      ',''));
    $spider->addField('body','<td><p id="blog_text" class="cnt"  >{body}</p></td></tr></table>');
    $spider->addField('author','shooting');
    $spider->run();
    $spider->saveSql();

    以上就是采集的内容,更多相关内容请关注PHP中文网(m.sbmmt.com)!

    声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。
    专题推荐:采集
    上一篇:MVC的调度器和模板类 下一篇:Nginx 使用 PHP 进行 IMAP 的认证

    相关文章推荐

    • php学习笔记之面向对象编程• php中常用的函数集合• php 字符串操作函数 (1/2)• 为什么FleaPHP使用Table Data Gateway代替Active Record提供数据库• PHP禁止图片文件的被盗链函数

    全部评论我要评论

  • 取消发布评论发送
  • 1/1

    PHP中文网