RMM(逆向最大匹配)分词算法代码片段

原创
2016-06-08 17:28:17 1019浏览

/**
 * Segment text into words using reverse maximum matching (RMM).
 *
 * The source string is normalised by ReviseString(), split on single spaces,
 * and the chunks are walked from last to first. Every processed chunk is
 * prepended to $this->ResultString, so the final string is in reading order.
 *
 * This method only ever prepends to $this->ResultString; it does not clear it.
 * NOTE(review): presumably SetSource() resets it - confirm against the class.
 *
 * $this->CommonUnit and $this->EspecialChar are used as regular-expression
 * fragments. They were written for the POSIX ereg() family (removed in PHP 7);
 * they are now run through preg_match() and are assumed to be valid PCRE.
 *
 * @param string $str Optional text to segment. When non-empty it replaces the
 *                    current source through SetSource().
 * @return string The accumulated $this->ResultString, or "" when the source is empty.
 */
function SplitRMM($str="")
{
    if ($str != "") {
        $this->SetSource(trim($str));
    }
    if ($this->SourceString == "") {
        return "";
    }

    // Coarse pre-segmentation of the text.
    $this->SourceString = $this->ReviseString($this->SourceString);

    // Split into chunks and handle each kind of chunk separately.
    $spwords = explode(" ", $this->SourceString);
    $spLen = count($spwords);
    $spc = $this->SplitChar;

    // Loop-invariant patterns, built once.
    $unitPattern = $this->_RmmPattern("^" . $this->CommonUnit);
    $especialPattern = $this->_RmmPattern($this->EspecialChar . "$");

    for ($i = $spLen - 1; $i >= 0; $i--) {
        $word = $spwords[$i];
        if (trim($word) == "") {
            continue;
        }

        if ($this->NotGBK($word)) {
            if (preg_match('/[^0-9.+-]/', $word)) {
                // Contains something other than digits, '.', '+' or '-'.
                $this->ResultString = $word . $spc . $this->ResultString;
            } else {
                // Purely numeric chunk: look at the word that follows it.
                // NOTE(review): the lookup uses a literal space rather than
                // $spc, exactly as the original did - confirm this is intended
                // when SplitChar is not " ".
                $cut = strpos($this->ResultString, " ");
                $nextword = ($cut === false) ? "" : substr($this->ResultString, 0, $cut);
                if (preg_match($unitPattern, $nextword)) {
                    // Number followed by a common unit: keep them together.
                    $this->ResultString = $word . $this->ResultString;
                } else {
                    $this->ResultString = $word . $spc . $this->ResultString;
                }
            }
            continue;
        }

        // Numeric value of the first two bytes. substr() avoids an
        // out-of-range offset when the chunk is a single byte.
        $c = substr($word, 0, 2);
        $n = hexdec(bin2hex($c));

        if ($c == "《") {
            // Book title: emitted as one token.
            // NOTE(review): this comparison only works when the file's own
            // encoding matches the encoding of the input text.
            $this->ResultString = $word . $spc . $this->ResultString;
        } else if ($n > 0xA13F && $n < 0xAA40) {
            // Punctuation / symbol: emitted as one token.
            // NOTE(review): the upper bound was lost in the published copy of
            // this code; 0xAA40 is a reconstruction - confirm against the
            // original source.
            $this->ResultString = $word . $spc . $this->ResultString;
        } else if (strlen($word) <= $this->SplitLen) {
            // Ordinary short phrase, no further segmentation.
            // NOTE(review): the comparison operator and "$this->" were lost in
            // the published copy; "<=" is a reconstruction - confirm.

            // If the chunk ends with a special separator word, split it off.
            if (preg_match($especialPattern, $word, $regs)) {
                $tail = $regs[0];
                $tailLen = strlen($tail);
                // Strip the matched suffix literally instead of re-using the
                // matched text as a regular expression.
                if ($tailLen > 0 && substr($word, -$tailLen) === $tail) {
                    $word = substr($word, 0, strlen($word) - $tailLen);
                }
                $word = $word . $spc . $tail;
            }

            // Does the chunk start with a common unit?
            if ($i == 0 || !preg_match($unitPattern, $word)) {
                $this->ResultString = $word . $spc . $this->ResultString;
            } else {
                // Glue the unit onto the preceding chunk and skip that chunk.
                $this->ResultString = $spwords[$i - 1] . $word . $spc . $this->ResultString;
                $i--;
            }
        } else {
            // Longer phrase: segment it with RunRMM().
            $this->ResultString = $this->RunRMM($word) . $spc . $this->ResultString;
        }
    }

    return $this->ResultString;
}

/**
 * Wrap a regular-expression fragment in PCRE delimiters for preg_match().
 *
 * Picks the first delimiter that does not occur in the fragment, so the
 * fragment needs no extra escaping. The "D" modifier makes "$" match only at
 * the very end of the subject rather than also before a trailing newline.
 *
 * @access private
 * @param string $ere Regular-expression fragment without delimiters.
 * @return string Delimited pattern ready for preg_match().
 */
function _RmmPattern($ere)
{
    foreach (array('/', '#', '~', '%', '!', '@') as $delimiter) {
        if (strpos($ere, $delimiter) === false) {
            return $delimiter . $ere . $delimiter . 'D';
        }
    }
    // Every candidate occurs in the fragment: fall back to escaping "/".
    return '/' . str_replace('/', '\/', $ere) . '/D';
}

声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。