PHP は Web ページのタイトルを抽出し、無関係な SEO キーワードを削除します-PHPチュートリアル-php.cn

PHP は Web ページのタイトルを抽出し、無関係な SEO キーワードを削除します

WBOY

リリース： 2016-06-13 13:04:01

オリジナル

1377 人が閲覧しました

PHP は Web ページのタイトルを抽出し、無関係な SEO キーワードを削除します
シーンの説明:

以前は、Web ページのタイトルを抽出する場合、<title> と </title> の間にあるコンテンツをそのまま抽出していました。しかし実際には次のような状況があります。たとえば javaeye の記事 http://www.iteye.com/news/21643 では、<title> の内容は「10 年間のソフトウェア開発が教えてくれた 10 の最も重要なこと - 非技術 - ITeye 情報」ですが、実際に引用する際のタイトルは「10 年間のソフトウェア開発が教えてくれた 10 の最も重要なこと」であるべきです。タイトルの後ろには無関係なキーワードが多数付いているため、次の方法でこれらのキーワードを除外できます。

1 . h1 などのタグを検索します (いくつかの新浪ニュースサイトを分析した結果、それは実現不可能であり、多くの干渉があると感じます)

2. 全文から <title> 要素を取り除き、<title> の内容を区切り文字 (_ | - など) で a1、a2、a3、a4 のように分割します。最長の語句 (たとえば a3) から全文検索を開始し、検索が失敗するまで左側の a2、a1 を順に連結して検索を繰り返します。左側が失敗したら、同じように右側へ向かって繰り返します (ここではこの方法を使用します)

<?php
/**
 * @author pqcc <struts.ec@mgail.com>
 * @date: 2011-06-18
 * Description: given the HTML of a web page, extract the page title without
 * the SEO keywords that sites usually append to it.
 * e.g. the raw <title> of a news article is
 *       "大学英语四六级本周六开考 909万人参考_新浪教育_新浪网",
 *       but the wanted result is "大学英语四六级本周六开考 909万人参考".
 * Scope: titles of final article pages; topic/index pages are not covered.
 *
 * Approach: split the <title> text on separator characters, take the longest
 * segment as a seed, and grow it to the left and then to the right for as
 * long as the grown text still occurs in the page body (with <title> and
 * <meta> removed).
 */
class TitlePurify{

    // Character class (PCRE syntax, without delimiters) of the characters
    // that separate the segments of a title.
    private $matches_preg = '[-_\s|―]';

    // Matches a complete <title> element; group 1 is its text content.
    private $title_preg = '/<title\b[^>]*>(.*?)<\/title>/is';

    /**
     * Extract the purified title from a page.
     *
     * @param string $contents Full HTML of the page.
     * @return string The purified title, or the literal "标题抽取失败" when
     *                the page has no <title> element.
     */
    public function getTitle($contents){/*{{{*/
        if(preg_match($this->title_preg, $contents, $matches) !== 1){
            return "标题抽取失败";
        }
        return $this->trimTitle($matches[1], $contents);
    }/*}}}*/

    /**
     * Remove the <title> element and all <meta> tags from the page, so that
     * the title text is only searched for in the remaining markup.
     *
     * @param string $contents Full HTML of the page.
     * @return string The page without <title> and <meta>; the input is
     *                returned unchanged if the regex engine reports an error.
     */
    public function trimMeta($contents){/*{{{*/
        $stripped = preg_replace(
            array($this->title_preg, '/<meta\b[^>]*>/i'),
            '',
            $contents
        );
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit).
        return $stripped === null ? $contents : $stripped;
    }/*}}}*/

    /**
     * Find the position of the longest segment.
     *
     * Length is measured in bytes (strlen). On a tie the first segment wins.
     *
     * @param array $titles Title segments.
     * @return int Zero-based position (iteration order, not array key) of the
     *             longest segment; 0 when the list is empty or all segments
     *             are empty.
     */
    public function getMaxIndex($titles){/*{{{*/
        $maxItemIndex   = 0;
        $maxLength      = 0;
        $loop           = 0;
        foreach($titles as $item){
            $length = strlen($item);
            if($length > $maxLength){
                $maxLength      = $length;
                $maxItemIndex   = $loop;
            }
            $loop++;
        }
        return $maxItemIndex;
    }/*}}}*/

    /**
     * Grow the seed segment into the longest run of neighbouring segments
     * whose text (separators included, exactly as written in the title)
     * still occurs in the page.
     *
     * The run is first extended to the left one segment at a time until a
     * candidate is not found in $contents, then extended to the right in the
     * same way starting from the last candidate that WAS found.
     *
     * @param string $title        The raw title text.
     * @param array  $titles       Segments of $title, in title order. Empty
     *                             segments are ignored.
     * @param string $contents     Page text to search in.
     * @param int    $maxItemIndex Position of the seed segment in $titles.
     * @return string The widest confirmed span of $title. Returns $title
     *                unchanged when the seed is missing or empty, and the
     *                seed itself when it cannot be located inside $title.
     */
    public function trim($title, $titles, $contents, $maxItemIndex){/*{{{*/
        $titles = array_values($titles);
        $count  = count($titles);
        if(!isset($titles[$maxItemIndex]) || (string)$titles[$maxItemIndex] === ''){
            return $title;
        }

        // Byte offset of every non-empty segment inside $title. Segments are
        // located left to right so repeated words map to the right occurrence.
        $offsets = array();
        $cursor  = 0;
        for($i = 0; $i < $count; $i++){
            $item = (string)$titles[$i];
            if($item === ''){
                continue;
            }
            $pos = strpos($title, $item, $cursor);
            if($pos === false){
                continue; // segment does not belong to this title; ignore it
            }
            $offsets[$i] = $pos;
            $cursor      = $pos + strlen($item);
        }
        if(!isset($offsets[$maxItemIndex])){
            return $titles[$maxItemIndex];
        }

        // [$start, $end) is the span of $title confirmed so far.
        $start = $offsets[$maxItemIndex];
        $end   = $start + strlen($titles[$maxItemIndex]);

        // Extend to the left until the first candidate that is not found.
        for($i = $maxItemIndex - 1; $i >= 0; $i--){
            if(!isset($offsets[$i])){
                continue;
            }
            $candidate = substr($title, $offsets[$i], $end - $offsets[$i]);
            if(!$this->containsText($contents, $candidate)){
                break;
            }
            $start = $offsets[$i];
        }

        // Then extend to the right, starting from the confirmed span.
        for($i = $maxItemIndex + 1; $i < $count; $i++){
            if(!isset($offsets[$i])){
                continue;
            }
            $itemEnd   = $offsets[$i] + strlen($titles[$i]);
            $candidate = substr($title, $start, $itemEnd - $start);
            if(!$this->containsText($contents, $candidate)){
                break;
            }
            $end = $itemEnd;
        }

        return substr($title, $start, $end - $start);
    }/*}}}*/

    /**
     * Purify a title against the page it was taken from.
     *
     * @param string $title    Raw text of the <title> element.
     * @param string $contents Full HTML of the page.
     * @return string The purified title. $title is returned unchanged when it
     *                cannot be split (e.g. it is not valid UTF-8), when it has
     *                no non-empty segment, or when its longest segment does
     *                not occur in the page.
     */
    public function trimTitle($title, $contents){/*{{{*/
        $contents = $this->trimMeta($contents);

        // The "u" modifier makes the multi-byte separator in the class match
        // as one character instead of as its individual bytes; runs of
        // separators (" - ") are treated as a single split point.
        $titles = preg_split("/{$this->matches_preg}+/u", $title, -1, PREG_SPLIT_NO_EMPTY);
        if($titles === false || count($titles) === 0){
            return $title;
        }

        // Seed the search with the longest segment.
        $maxItemIndex = $this->getMaxIndex($titles);
        if(!$this->containsText($contents, $titles[$maxItemIndex])){
            return $title;
        }
        return $this->trim($title, $titles, $contents, $maxItemIndex);
    }/*}}}*/

    /**
     * Case-insensitive literal substring test.
     *
     * A plain string search is used instead of a regex so that characters
     * such as "/", "(", "?" or "+" in a title are matched literally.
     *
     * @param string $contents Text to search in.
     * @param string $needle   Text to look for.
     * @return bool True when $needle is non-empty and occurs in $contents.
     */
    private function containsText($contents, $needle){
        return $needle !== '' && stripos($contents, $needle) !== false;
    }

}

// -------------   test code ------------------------------
/**
 * Convert a fetched page to UTF-8 using the charset it declares.
 *
 * The first "charset=..." declaration found anywhere in the text is used,
 * with or without quotes around the value (covers both
 * `content="text/html; charset=gbk"` and `<meta charset="gbk">`).
 * When no declaration is found the text is assumed to be UTF-8.
 *
 * @param string $contents Raw page text.
 * @return string The text converted to UTF-8, or $contents unchanged when
 *                the declared charset is not one mbstring can convert from.
 */
function convertEncoding($contents){
    $charset = 'UTF-8';
    if(preg_match('/charset\s*=\s*["\']?([\w\-]+)/i', $contents, $match) === 1){
        $charset = $match[1];
    }

    try{
        $converted = mb_convert_encoding($contents, 'UTF-8', $charset);
    }catch(ValueError $e){
        // PHP 8 throws for an unknown encoding name; older versions return false.
        $converted = false;
    }

    return is_string($converted) ? $converted : $contents;
}

// Fetch a sample article, purify its title and report how long that took.
$url = 'http://china.nba.com/news/4/2011/0617/61383331/10451.html';
$contents = file_get_contents($url);

if($contents === false){
    // file_get_contents() returns false when the page cannot be fetched.
    echo "failed to fetch: $url";
}else{
    $contents = convertEncoding($contents);

    // microtime(true) returns a float number of seconds; the default form
    // returns a "msec sec" string that cannot be subtracted meaningfully.
    $startTime  = microtime(true);
    $purify     = new TitlePurify();
    $title      = $purify->getTitle($contents);
    $endTime    = microtime(true);

    echo "标题:        $title ";
    echo "cost: " . ($endTime-$startTime);
}

?>

ログイン後にコピー

。