基于Snoopy的PHP近似完美获取网站编码

原创
2016-07-25 09:08:53 626浏览
基于Snoopy的PHP近似完美获取网站编码
用于 PHP 爬虫,获取网站编码的准确率约为 99.9%;仍有少数网站的编码无法正确获取,欢迎高手完善。
代码来源: 站云网 www.siteyun.com
使用前需要先从网上下载 Snoopy.class.php,并在代码中引入该文件。
调用方法($go 为本类以目标网址构造的实例): <?php /* 此处原有的引入文件与创建 $go 对象的代码在页面抓取时丢失 */ echo $go->getCharset(); ?>

[code]url=$url; }

    // NOTE(review): everything before "url=$url; }" (the opening PHP tag, the
    // class header, the property declarations and the constructor signature)
    // was lost when this listing was extracted; only the constructor's tail
    // survives on the line above.

    /**
     * Fetch the page once and normalise its body to lower-case UTF-8.
     *
     * The Snoopy response is cached in $this->request, so every later call
     * only reports whether the cached response had status 200.
     *
     * @param string $url Address to fetch.
     * @return bool True when the (cached) response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        // Keep the untouched bytes. Lower-casing a multi-byte legacy encoding
        // byte by byte rewrites trail bytes that fall in the A-Z range (GBK and
        // Big5 allow them) and corrupts the text before it is converted.
        $raw = $this->request->results;

        // getCharset() reads $this->request->results and compares the captured
        // charset with the lower-case literal "gb2312" without lower-casing it
        // itself, so it still has to see a lower-cased copy.
        $this->request->results = strtolower($raw);
        $charset = $this->getCharset();

        // Nothing to convert when detection failed or the page is UTF-8 already.
        if (!is_string($charset) || $charset === '' || $charset == 'utf-8') {
            return true;
        }

        if ($charset == 'windows-1252') {
            $this->request->results = $this->uni_decode($this->request->results);
            return true;
        }

        // Convert first, lower-case afterwards: ASCII lower-casing is safe on
        // UTF-8 because every non-ASCII byte has its high bit set.
        $converted = mb_convert_encoding($raw, 'UTF-8', $charset);
        if (is_string($converted)) {
            $this->request->results = strtolower($converted);
        }

        return true;
    }

    /**
     * Collect the page title, meta keywords, meta description and IPv4 address.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; each value is a
     *               string and stays '' when it cannot be determined.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => ''
        );
        if (!$this->open($this->url)) {
            return $info;
        }
        $html = (string) $this->request->results;

        // NOTE(review): the pattern has no opening "<title>" (possibly lost in
        // extraction); it still captures the text between the last ">" and
        // "</title>", so it is kept unchanged.
        if (preg_match('/([^>]*)<\/title>/si', $html, $titlematch)) {
            $info['title'] = strip_tags($titlematch[1]);
        }

        // First try <meta name="..." content="...">.
        $nameIdx = 1;
        $valueIdx = 2;
        preg_match_all(
            '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            $html,
            $match
        );
        $names = (is_array($match) && isset($match[1])) ? $match[1] : array();

        // Neither tag found: retry with the attributes in the opposite order,
        // <meta content="..." name="...">, where the capture groups swap roles.
        if (!in_array('keywords', $names, true) && !in_array('description', $names, true)) {
            preg_match_all(
                '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
                $html,
                $match
            );
            $nameIdx = 2;
            $valueIdx = 1;
        }

        // name => content; a later duplicate tag overrides an earlier one.
        $meta = array();
        if (is_array($match) && count($match) == 3) {
            foreach ($match[$nameIdx] as $i => $name) {
                if (isset($match[$valueIdx][$i])) {
                    $meta[$name] = $match[$valueIdx][$i];
                }
            }
        }
        $info['keywords'] = isset($meta['keywords']) ? $meta['keywords'] : '';
        $info['desc'] = isset($meta['description']) ? $meta['description'] : '';

        // Resolve the host only: https URLs, paths, ports and query strings
        // must not reach gethostbyname().
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // URL without a scheme: drop a leading scheme if present and cut at
            // the first path, query, fragment or port delimiter.
            $bare = preg_replace('/^https?\:\/\//i', '', (string) $this->url);
            $host = substr($bare, 0, strcspn($bare, '/?#:'));
        }
        if ($host !== '' && strlen($host) <= 255) {
            // gethostbyname() returns the host name unchanged on failure, so the
            // result is validated as a real IPv4 address instead of merely
            // counting four dot-separated parts.
            $ip = gethostbyname($host);
            if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
                $info['ip'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Guess from the bytes whether a string is UTF-8 or GB2312.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string 'utf-8' when the bytes form valid UTF-8, 'gb2312' when
     *                they contain non-ASCII bytes that are not valid UTF-8,
     *                '' when there is no non-ASCII byte to judge by.
     */
    public function t($string, $o)
    {
        // Pure ASCII (or non-string input) carries no evidence either way.
        if (!is_string($string) || !preg_match('/[\x80-\xff]/', $string)) {
            return '';
        }

        // Validating the whole string avoids reporting GB2312 text whose lead
        // byte is >= 0xE0 as UTF-8, and 2-byte UTF-8 sequences as GB2312.
        return mb_check_encoding($string, 'UTF-8') ? 'utf-8' : 'gb2312';
    }

    /**
     * Turn 5-digit decimal numeric entities (&#NNNNN;) into characters.
     *
     * Each entity is rewritten as a \uXXXX escape, the text is wrapped in
     * double quotes and decoded with json_decode(); the result is converted
     * from UTF-8 to $code with iconv() when $code is not 'utf-8'.
     *
     * NOTE(review): json_decode() yields null when the text is not a valid
     * JSON string body (raw double quotes, control characters) - confirm the
     * callers tolerate that.
     *
     * @param string $str  Text containing numeric entities.
     * @param string $code Target encoding.
     * @return mixed Decoded text.
     */
    function uni_decode ($str, $code = 'utf-8'){
        // A closure replaces create_function(), which was removed in PHP 8.
        $str = json_decode(preg_replace_callback('/&#(\d{5});/', function ($dec) { return '\\u' . dechex($dec[1]); },
'"'.$str.'"')); if($code != 'utf-8'){ $str = iconv('utf-8', $code, $str); } return $str; } //获取网站编码 public function getCharset() { if(!$this->open($this->url)){return false;exit;} //首先从html获取编码 preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i",$this->request->results,$temp) ? strtolower($temp[1]):""; if($temp[1]!="") { if(in_array($temp[1], $this->charset_arr)) { if($temp[1]=="gb2312") { $tmp_charset=$this->t($this->request->results,$temp[1]); if($tmp_charset==$temp[1]) { return $temp[1]; } } else { return $temp[1]; } } } if(!empty($this->request->headers)) { //从header中获取编码 $hstr=strtolower(implode("</td></tr></table> </div> <div id="comment_19368" class="cm"> </div> <div id="post_rate_div_19368"></div> <br><br> </div> <div class="nphpQianMsg"><a href="//m.sbmmt.com/m/search?word=基于snoopy的php近似完美获取网站编码">基于Snoopy的PHP近似完美获取网站编码</a><a href="//m.sbmmt.com/m/course/list/29/type/2.html">PHP课程</a><a href="//m.sbmmt.com/m/course/list/11.html">HTML视频教程</a><a href="//m.sbmmt.com/m/course/list/12.html">CSS视频</a><a href="//m.sbmmt.com/m/course/list/17.html">JS视频教程</a><a href="//m.sbmmt.com/m/course/list/18.html">Vue视频教程</a><div class="clear"></div></div> <div class="nphpQianSheng"><span>声明:</span>本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。</div> <!-- <div class="nphpFen"> <span><a href="javascript:;"><b></b>分享</a></span> <em class="icon1"><b></b>收藏</em> <i class="icon1"><b></b>点赞</i> <div class="clear"></div> </div> --> </div> <script type="text/javascript" src="//m.sbmmt.com/sw/hezuo/16e561434ad42f17963b25d80a9fabeb.html" ></script> <div class="nphpSytBox"> <span>上一条:<a class="dBlack" href="//m.sbmmt.com/m/faq/318739.html">利用curl,正则表达式做的一个php蜘蛛抓取器 </a></span> <span>下一条:<a class="dBlack" href="//m.sbmmt.com/m/faq/318745.html">一个SOAP Client的简单测试(带测试实例) </a></span> </div> <div class="wwads-cn wwads-horizontal" data-id="164" style="max-width:100%"></div> <div class="nphpSytBox1"> <div class="nphpShou"> <script type="text/javascript" 
src="//m.sbmmt.com/sw/hezuo/7295e38849324c4a3f458e359856409d.html" ></script> <div class="clear"></div> </div> <ul class="nphpBian"> <li> <a href="//m.sbmmt.com/m/blog/detail/1046940.html"><span class="bg1"></span><dl><dt>编程学习群</dt><dd>技术答疑交流</dd></dl><div class="clear"></div></a> </li> <li> <a href="javascript:window.location.href = 'https://mp.weixin.qq.com/s?__biz=Mzk0MTE2MDU0Ng==&mid=2247487529&idx=1&sn=f4f9feda0951312daebe69478f219e7a&chksm=c2d7f124f5a0783233d879a6de3c59165dc2520e15813860b5bace817a6e08e530d454abd3c6#rd';"><span class="bg2"></span><dl><dt>关注公众号</dt><dd>获取学习资源</dd></dl><div class="clear"></div></a> </li> <div class="clear"></div> </ul> </div> <div class="nphpSytBox2"> <div class="nphpZbktTitle"> <h2>相关文章</h2> <em><a href="//m.sbmmt.com/m/article.html" class="bBlack"><i>查看更多</i><b></b></a></em> <div class="clear"></div> </div> <ul class="nphpXgwzList"> <!-- <li><b></b><a href="//m.sbmmt.com/jump/go.php?url=https%3A%2F%2Fm.sbmmt.com%2Fvip_mobile.html" class="aBlack" style="color: red;" rel="nofollow">❤️‍🔥共22门课程,总价3725元,会员免费学</a><div class="clear"></div></li> --> <!-- <li><b></b><a href="//m.sbmmt.com/jump/go.php?url=https%3A%2F%2Fm.php.cn%2Farticle%2F496353.html" class="aBlack" style="color: red;" rel="nofollow">❤️‍🔥接口自动化测试不想写代码?</a><div class="clear"></div></li> --> <li><b></b><a href="//m.sbmmt.com/m/faq/595481.html" class="aBlack">PHP中的多重继承</a><div class="clear"></div></li> <li><b></b><a href="//m.sbmmt.com/m/faq/595375.html" class="aBlack">在Laravel的@if语句中如何获取当前URL?</a><div class="clear"></div></li> <li><b></b><a href="//m.sbmmt.com/m/faq/593896.html" class="aBlack">PHP报错:解析常量定义时遇到的问题</a><div class="clear"></div></li> <li><b></b><a href="//m.sbmmt.com/m/faq/593468.html" class="aBlack">如何使用PHP实现密码加密功能</a><div class="clear"></div></li> <li><b></b><a href="//m.sbmmt.com/m/faq/593680.html" class="aBlack">解决PHP Fatal error: Call to undefined function mysqli_connect() in file.php on line X</a><div class="clear"></div></li> </ul> </div> <script 
type="text/javascript" src="//m.sbmmt.com/sw/hezuo/cf85c41f1b0ce5f8359e5229784c31e4.html" ></script> <div class="nphpSytBox2"> <div class="nphpTjkcTitle"> <ul class="nphpTjkcMenu menu1"> <li class="current">热门课程</li> <div class="clear"></div> </ul> <div class="clear"></div> </div> <div class="nphpRmkcBox" style="padding-top:0px;"> <ul class="nphpRmkcList"> <li> <a href="//m.sbmmt.com/m/course/1134.html"><img data-src="//m.sbmmt.com/img/upload/course/000/000/068/62555a16c9bf9556.png" alt="TP6.0 搭建个人博客实战(玉女心经版)" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/000/068/62555a16c9bf9556.png" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> <dt><a href="//m.sbmmt.com/m/course/1134.html" title="TP6.0 搭建个人博客实战(玉女心经版)" class="aBlack">TP6.0 搭建个人博客实战(玉女心经版)</a></dt> <dd> <span> <em> <h2>¥71</h2> <i>¥79</i> <div class="clear"></div> </em> <p>已抢186902个</p> </span> <b><a href="//m.sbmmt.com/m/course/1134.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> <li> <a href="//m.sbmmt.com/m/course/1112.html"><img data-src="//m.sbmmt.com/img/upload/course/000/000/068/625662615beae981.png" alt="php mysql实战:学生信息管理系统(玉女心经版)" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/000/068/625662615beae981.png" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> <dt><a href="//m.sbmmt.com/m/course/1112.html" title="php mysql实战:学生信息管理系统(玉女心经版)" class="aBlack">php mysql实战:学生信息管理系统(玉女心经版)</a></dt> <dd> <span> <em> <h2>¥89</h2> <i>¥99</i> <div class="clear"></div> </em> <p>已抢67408个</p> </span> <b><a href="//m.sbmmt.com/m/course/1112.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> <li> <a href="//m.sbmmt.com/m/course/893.html"><img data-src="//m.sbmmt.com/img/upload/course/000/000/068/625558b87e512730.png" alt="CSS视频教程-玉女心经版" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/000/068/625558b87e512730.png" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> 
<dt><a href="//m.sbmmt.com/m/course/893.html" title="CSS视频教程-玉女心经版" class="aBlack">CSS视频教程-玉女心经版</a></dt> <dd> <span> <em> <h2>¥62</h2> <i>¥69</i> <div class="clear"></div> </em> <p>已抢353509个</p> </span> <b><a href="//m.sbmmt.com/m/course/893.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> <li> <a href="//m.sbmmt.com/m/course/894.html"><img data-src="//m.sbmmt.com/img/upload/course/000/000/068/625558e4f11c8518.png" alt="JavaScript极速入门_玉女心经系列" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/000/068/625558e4f11c8518.png" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> <dt><a href="//m.sbmmt.com/m/course/894.html" title="JavaScript极速入门_玉女心经系列" class="aBlack">JavaScript极速入门_玉女心经系列</a></dt> <dd> <span> <em> <h2>¥62</h2> <i>¥69</i> <div class="clear"></div> </em> <p>已抢667997个</p> </span> <b><a href="//m.sbmmt.com/m/course/894.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> <li> <a href="//m.sbmmt.com/m/course/762.html"><img data-src="//m.sbmmt.com/img/upload/course/000/013/745/5aab60e0ad5fc891.jpg" alt="零基础php开发视频教程VIP视频课" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/013/745/5aab60e0ad5fc891.jpg" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> <dt><a href="//m.sbmmt.com/m/course/762.html" title="零基础php开发视频教程VIP视频课" class="aBlack">零基础php开发视频教程VIP视频课</a></dt> <dd> <span> <em> <h2>¥99</h2> <i>¥299</i> <div class="clear"></div> </em> <p>已抢123306个</p> </span> <b><a href="//m.sbmmt.com/m/course/762.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> <li> <a href="//m.sbmmt.com/m/course/1540.html"><img data-src="//m.sbmmt.com/img/upload/course/000/000/068/643cef38f10ac763.png" alt="前端基础到实战(HTML5+CSS3+ES6+NPM)" class="lazyload" src="//m.sbmmt.com/img/upload/course/000/000/068/643cef38f10ac763.png" onerror="this.src='/static/mobimages/moren/236_132.png'"/></a> <dl> <dt><a 
href="//m.sbmmt.com/m/course/1540.html" title="前端基础到实战(HTML5+CSS3+ES6+NPM)" class="aBlack">前端基础到实战(HTML5+CSS3+ES6+NPM)</a></dt> <dd> <span> <em> <h2>¥800</h2> <i>¥1200</i> <div class="clear"></div> </em> <p>已抢25005个</p> </span> <b><a href="//m.sbmmt.com/m/course/1540.html">抢</a></b> <div class="clear"></div> </dd> </dl> <div class="clear"></div> </li> </ul> </div> <div class="nphpShou2"> <a href="//m.sbmmt.com/m/app/" class="aRed"><b></b><em>打开APP,随时随地在线学习!</em><span></span></a> <div class="clear"></div> </div> </div> </div> <!--主体 end--> <!--底部菜单--> <div class="nphpFoot" id="nphpFoot" style="display:none;"> <script type="text/javascript" src="//m.sbmmt.com/sw/hezuo/7c9c0cc71ad595f7716f2f0c50381e48.html" ></script> </div> <!--底部菜单--> <!--右侧菜单--> <div class="nphpYouBox" style="display: none;"> <div class="nphpYouBg"> <div class="nphpYouTitle"><span onclick="$('.nphpYouBox').hide()"></span><a href="//m.sbmmt.com/m/"></a><div class="clear"></div></div> <ul class="nphpYouList"> <li><a href="//m.sbmmt.com/m/"><b class="icon1"></b><span>首页</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/course.html"><b class="icon2"></b><span>课程</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/article.html"><b class="icon3"></b><span>文章</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/wenda.html"><b class="icon4"></b><span>问答</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/blog.html"><b class="icon5"></b><span>博客</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/dic.html"><b class="icon6"></b><span>词典</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/course/type/99.html"><b class="icon7"></b><span>手册</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/xiazai/"><b class="icon8"></b><span>资源</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/search"><b class="icon9"></b><span>搜索</span><div class="clear"></div></a></li> <li><a 
href="//m.sbmmt.com/m/app/"><b class="icon10"></b><span>APP下载</span><div class="clear"></div></a></li> <li><a href="//m.sbmmt.com/m/mk.html"><b class="icon11"></b><span>PHP培训</span><em>新</em><div class="clear"></div></a></li> <div class="clear"></div> </ul> </div> </div> <!--右侧菜单 end--> <!--顶部导航--> <div class="nphpDing" style="display: none;"> <div class="nphpDinglogo"><a href="//m.sbmmt.com/m/faq/#"></a></div> <div class="nphpNavIn1"> <div class="swiper-container nphpNavSwiper1"> <div class="swiper-wrapper"> <div class="swiper-slide"><a href="//m.sbmmt.com/m/" >首页</a></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/course.html" >课程</a></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/map.html">路径</a></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/article.html" class="hover">文章</a></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/mk.html" title="PHP培训">PHP培训</a><b></b></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/coding.html">精品课</a></div> <div class="swiper-slide"><a href="//m.sbmmt.com/m/xiazai" >下载</a></div> <div class="clear"></div> </div> </div> <script> var swiper = new Swiper('.nphpNavSwiper1', { slidesPerView : 'auto' }); </script> </div> </div> <!--顶部导航 end--> </div> <script> (function () { var bp = document.createElement('script');var curProtocol = window.location.protocol.split(':')[0]; if (curProtocol === 'https') {bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';} else {bp.src = 'http://push.zhanzhang.baidu.com/push.js';} var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(bp, s); })(); var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://m.sbmmt.com/hm.js?c0e685c8743351838d2a7db1c49abd56"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> <script>isLogin = 0;</script> <script type="text/javascript" src="//m.sbmmt.com/m/static/layui/layui.js"></script> <script 
type="text/javascript" src="//m.sbmmt.com/m/static/js/global.js?4.9.47"></script> <script> var returnCitySN = ''; </script> <script type="text/javascript" charset="UTF-8" src="//m.sbmmt.com/ip/city.php?sign=91b10883918ab297abf0cd0d07975c48"></script> <script> var cname = returnCitySN.cname; var cname = cname.split("|"); function setCookie(name,value,iDay){ //name相当于键,value相当于值,iDay为要设置的过期时间(天) var oDate = new Date(); oDate.setDate(oDate.getDate() + iDay); document.cookie = name + '=' + value + ';path=/;domain=.php.cn;expires=' + oDate; } //读cookies function getCookiea(name) { var arr,reg=new RegExp("(^| )"+name+"=([^;]*)(;|$)"); if(arr=document.cookie.match(reg)) return arr[2]; else return null; } var ad = getCookiea('ad'); if(ad) { var num = parseInt(ad)+1; } else { var num = 1; } if(num <= 2000) { if( returnCitySN == '' || encodeURIComponent(returnCitySN.cname).indexOf('%E5%8C%97%E4%BA%AC') != -1 ) { $('#nphpFoot').remove(); $('#adTop').remove(); } else { $('#nphpFoot').show(); $('#adTop').show(); $(document).ready(function(){ $('.nphpQianCont').before('<div class="nphpShou"><a href="https://work.weixin.qq.com/kfid/kfc8cc2d6defcf3c202" targe="_blank" style="width:100%;" rel="nofollow" class="aRed"><b class="icon2"></b><em style=" font-style:normal">【'+cname[2]+'】PHP编程学习,咨询领取优惠!</em><span></span></a></div>') }) } setCookie('ad',num,1) } </script> </div> <script src="https://vdse.bdstatic.com//search-video.v1.min.js"></script> <link rel='stylesheet' id='_main-css' href='//m.sbmmt.com/m/static/css/viewer.min.css' type='text/css' media='all'/> <script type='text/javascript' src='//m.sbmmt.com/m/static/js/viewer.min.js?1'></script> <script type='text/javascript' src='//m.sbmmt.com/m/static/js/jquery-viewer.min.js'></script> <script> jQuery.fn.wait = function (func, times, interval) { var _times = times || -1, //100次 _interval = interval || 20, //20毫秒每次 _self = this, _selector = this.selector, //选择器 _iIntervalID; //定时器id if( this.length ){ //如果已经获取到了,就直接执行函数 func && 
func.call(this); } else { _iIntervalID = setInterval(function() { if(!_times) { //是0就退出 clearInterval(_iIntervalID); } _times <= 0 || _times--; //如果是正数就 -- _self = $(_selector); //再次选择 if( _self.length ) { //判断是否取到 func && func.call(_self); clearInterval(_iIntervalID); } }, _interval); } return this; } $("table.syntaxhighlighter").wait(function() { $('table.syntaxhighlighter').append("<p class='cnblogs_code_footer'><span class='cnblogs_code_footer_icon'></span></p>"); }); $(document).on("click", ".cnblogs_code_footer",function(){ $(this).parents('table.syntaxhighlighter').css('display','inline-table');$(this).hide(); }); $('.nphpQianCont').viewer({navbar:true,title:false,toolbar:false,movable:false,viewed:function(){$('img').click(function(){$('.viewer-close').trigger('click');});}}); $(function() { $(".nphpFen em").click(function(){ if($(this).hasClass("icon1")){ $(this).removeClass("icon1").addClass("icon2").html("<b></b>已收藏") }else{ $(this).removeClass("icon2").addClass("icon1").html("<b></b>收藏") } }) $(".nphpFen i").click(function(){ if($(this).hasClass("icon1")){ $(this).removeClass("icon1").addClass("icon2").html("<b></b>已点赞") }else{ $(this).removeClass("icon2").addClass("icon1").html("<b></b>点赞") } }) }) </script> </body> </html>