リクエストされたページからキーワードを抽出する-PHPチュートリアル-php.cn

リクエストされたページからキーワードを抽出する

WBOY

リリース： 2016-07-25 08:49:34

オリジナル

1082 人が閲覧しました

指定された URL の Web ページから、いくつかのキーワードを抽出できます。

例えば、Code Zhuji のホームページからは、下の図のようなキーワードを抽出できます。（図：リクエストされたページからキーワードを抽出する）

// Demo: extract keywords from the page whose address is passed as ?url=...
// Example: ?url=http://www.codepearl.com
if (!empty($_REQUEST['url']) && is_string($_REQUEST['url'])) {
    $url = trim($_REQUEST['url']);

    // The URL comes straight from the request and is later handed to
    // file_get_contents(); only allow plain web URLs so the demo cannot be
    // used to read local files or other stream wrappers.
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

    if ($scheme === 'http' || $scheme === 'https') {
        include 'class.keywords.php';

        $keywords = new tagsugest();
        $keywords->_lang = 'es';
        $keywords->_encoding = 'iso-8859-1';
        $keywords->_catego = 'telecom';
        // NOTE(review): the original trailing remark suggests this value is
        // treated like a percentage - confirm against the class heuristic.
        $keywords->_keyCount = 100;
        $keywords->file($url);

        // Alternatively, run only one of the two extraction passes:
        // $keywords->readMetaKeyWords();
        // $keywords->readHtmlKeyWords();
        $keywords->readAll();

        echo '見つかったキーワード:<br><br>';

        $i = 1;
        foreach ($keywords->get() as $word) {
            // Keywords originate from a remote page: escape before output.
            echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, $keywords->_encoding) . "<br>\n";
        }
    } else {
        echo 'Only http:// and https:// URLs are supported.';
    }
}
echo "<br>\n";
?>

コードをコピー

/**
 * Suggests keywords for an HTML document.
 *
 * Keywords are collected from the page's <meta name="keywords"> tag and/or
 * from a word-frequency count of the visible text. Stop words and a
 * per-category glossary are loaded from the include files
 * "stopwords_<lang>.php" and "glodic_<category>_<lang>.php".
 *
 * Typical use: file() or html() to load a document, readAll() to analyse it,
 * get() to fetch the resulting keyword list.
 */
class tagsugest {
    /** Raw HTML being analysed, or FALSE when nothing has been loaded. */
    public $_html = false;
    /** Target keyword count; feeds the "how many to keep" heuristic. */
    public $_keyCount = 5;
    /** Keywords found so far. */
    public $_keyWords = array();
    /** Character encoding used for lower-casing and entity decoding. */
    public $_encoding = 'UTF-8';
    /** Language code used to build the stop-word / glossary file names. */
    public $_lang = 'es';
    /** Category used to build the glossary file name. */
    public $_catego = 'telecom';
    /** Last file name or URL passed to file(). */
    public $_url = '';

    /**
     * Reads the keywords from the <meta name="keywords" content="..."> tag.
     *
     * On a match, $_keyWords is replaced by the lower-cased, trimmed,
     * de-duplicated entries of the comma-separated list. Without a match
     * (or without loaded HTML) $_keyWords is left untouched.
     */
    public function readMetaKeyWords() {
        if (! $this->_html) {
            return;
        }

        // Expects the "name" attribute before "content"; the content quotes
        // and the self-closing slash are optional.
        $pattern = '/<\s*meta\s*name\s*=\s*"\s*keywords\s*"\s*content\s*=\s*"?([^>"]*)"?\s*\/?\s*>/is';
        if (! preg_match($pattern, $this->_html, $match)) {
            return;
        }

        // Normalise every whitespace character to a plain space first.
        $list = preg_replace('/\s/', ' ', $this->toLower($match[1]));

        $found = array();
        foreach (explode(',', (string) $list) as $keyword) {
            $keyword = trim($keyword);
            if ($keyword !== '') {
                $found[] = $keyword;
            }
        }

        $this->_keyWords = array_values(array_unique($found));
    }

    /**
     * Replaces every HTML tag with a space and collapses runs of spaces.
     *
     * @param mixed $string Text that may contain markup.
     * @return string The text without tags, trimmed.
     */
    private function rip_tags($string) {
        // ----- remove HTML tags -----
        $string = preg_replace('/<[^>]*>/', ' ', (string) $string);
        // ----- remove multiple spaces -----
        return trim((string) preg_replace('/ {2,}/', ' ', (string) $string));
    }

    /**
     * Lower-cases text using the configured encoding.
     *
     * Falls back to strtolower() when the mbstring extension is missing.
     *
     * @param string $text
     * @return string
     */
    private function toLower($text) {
        if (function_exists('mb_strtolower')) {
            return mb_strtolower($text, $this->_encoding);
        }
        return strtolower($text);
    }

    /**
     * preg_replace() that returns the unmodified subject when PCRE fails.
     *
     * preg_replace() returns null on error (e.g. backtrack limit on a very
     * large page); keeping the input avoids wiping the whole document.
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private function replaceOrKeep($pattern, $replacement, $subject) {
        $result = preg_replace($pattern, $replacement, $subject);
        return ($result === null) ? $subject : $result;
    }

    /**
     * Reads keywords from the page body (or from a string set via html()).
     *
     * Keywords already present in $_keyWords are appended to the text so
     * they take part in the frequency count, then the list is rebuilt from
     * the most frequent words that are neither stop words, punctuation nor
     * numbers. Words listed in the glossary get a frequency bonus.
     */
    public function readHtmlKeyWords() {
        if (! $this->_html) {
            return;
        }

        // Fold previously found keywords back into the text.
        if (! empty($this->_keyWords)) {
            $this->_html = $this->_html . ' ' . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }

        $this->_html = str_replace('&nbsp;', ' ', $this->_html);

        // Remove elements whose content is not visible text.
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
        foreach ($toRemove as $remove) {
            // \b keeps "head" from matching "<header ...>".
            $this->_html = $this->replaceOrKeep(
                '/<' . $remove . '\b.*?<\/' . $remove . '\s*>/is',
                ' ',
                $this->_html
            );
        }

        // Remove comments.
        $this->_html = $this->replaceOrKeep('/<!--.*?-->/s', ' ', $this->_html);

        // Remove the remaining tags and lower-case the text.
        $this->_html = $this->toLower($this->rip_tags($this->_html));
        $this->_html = htmlspecialchars_decode($this->_html);

        // Decode the remaining HTML entities.
        $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);

        // Split into words on whitespace and basic punctuation.
        $words = preg_split('/[\s.,:;!?|]+/', $this->_html, -1, PREG_SPLIT_NO_EMPTY);
        if (! $words) {
            return;
        }

        $frequency = array_count_values($words);

        // The include files presumably define $stopWords and $glodic
        // (not visible here); default both so a missing file only costs
        // the filtering/bonus instead of raising errors further down.
        // NOTE(review): $_lang and $_catego end up in an include path and
        // must never be taken from untrusted input.
        $stopWords = array();
        $glodic = array();
        include 'stopwords_' . $this->_lang . '.php';
        include 'glodic_' . $this->_catego . '_' . $this->_lang . '.php';
        if (! is_array($stopWords)) {
            $stopWords = array();
        }
        if (! is_array($glodic)) {
            $glodic = array();
        }

        // Drop stop words and tokens consisting only of punctuation.
        $punct = '~!@#$%^&*()_+|}{[];:\'",<>./?`-=\\';
        foreach (array_keys($frequency) as $word) {
            // Numeric tokens come back as integer keys; compare as strings.
            $token = (string) $word;
            if (in_array($token, $stopWords, true) || strspn($token, $punct) == strlen($token)) {
                unset($frequency[$word]);
            }
        }

        // Everything was filtered out: nothing to rank (also avoids max()
        // on an empty array and a division by zero below).
        if (! count($frequency)) {
            return;
        }

        // Heuristic for how many keywords to keep.
        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);
        if ($tot > $count) {
            $tot = $tot / 2;
        }
        // NOTE(review): this halves $tot, not $tot2 - possibly a typo in
        // the original; kept as-is because the intent cannot be confirmed.
        if ($tot2 > $count) {
            $tot = $tot / 2;
        }
        $showmax = round(($tot + $tot2) / 2);

        // Glossary words get a bonus so they rank ahead of ordinary words.
        foreach (array_keys($frequency) as $word) {
            if (in_array((string) $word, $glodic, true)) {
                $frequency[$word] = $frequency[$word] + $showmax;
            }
        }

        // Sort by frequency, highest first.
        arsort($frequency, SORT_NUMERIC);

        // Add to the keyword list until $showmax entries were taken.
        $i = 0;
        foreach ($frequency as $word => $hits) {
            $token = (string) $word;
            if ($token === '' || is_numeric($token) || in_array($token, $this->_keyWords, true)) {
                continue;
            }
            $this->_keyWords[] = $token;
            $i++;
            if ($i == $showmax) {
                break;
            }
        }
    }

    /**
     * Changes the encoding from the default UTF-8.
     *
     * @param mixed $enc Encoding name; a falsy value leaves it unchanged.
     */
    private function encoding($enc = false) {
        if ($enc) {
            $this->_encoding = $enc;
        }
    }

    /**
     * Reads the document from a file or URL.
     *
     * A failed read is silent: $_html then becomes FALSE and the read*
     * methods do nothing.
     *
     * @param mixed $fileUrl File name or URL; a falsy value is ignored.
     */
    public function file($fileUrl = false) {
        if ($fileUrl) {
            // Deliberately best-effort: warnings are suppressed.
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Defines the HTML to analyse as a string.
     *
     * @param mixed $page HTML source; a falsy value is ignored.
     */
    public function html($page = false) {
        if ($page) {
            $this->_html = $page;
        }
    }

    /**
     * Reads both the meta keywords and the body keywords.
     */
    public function readAll() {
        if ($this->_html !== false) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }
        $this->_keyWords = array_values(array_unique($this->_keyWords));
    }

    /**
     * Returns the keywords found so far.
     *
     * @return array
     */
    public function get() {
        return $this->_keyWords;
    }
}
?>

コードをコピー