PHP制作百度词典查词采集器_php实例

原创
2016-06-07 17:14:24 851浏览

百度dict 采集样本

这是我写的一个采集器,用来采集百度词典(dict)翻译后的全部结果数据,并附带了 13.5 万条的单词库和一个简单的采集案例。这里我把其中最主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict,有需要的直接 fork 就可以了~么么哒。这东西用的人很少,所以觉得有用的兄弟尽管拿走哈~

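	/**
	 * Look up a word and gather every section of its Baidu dictionary entry.
	 *
	 * @param  $word  the word to look up
	 * @return array  array(
	 *				"symbol" => phonetic symbols
	 *				"pro"	 => pronunciation
	 *				"example"=> example sentences
	 *				"explain"=> concise definitions
	 *				"synonym"=> synonyms and antonyms
	 *				"phrase" => array of phrases
	 *			)
	 *
	 */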
	public function content($word){
		// Store the word on the instance before any section is collected.
		$this->word = $word;

		// Array elements are evaluated top to bottom, so the helpers still run in
		// the order Pronounced, getSay, getExample, getExplain, getSynonym, getPhrase.
		return array(
			"symbol"  => $this->Pronounced(),	// phonetic symbols
			"pro"     => $this->getSay(),		// pronunciation
			"example" => $this->getExample(),	// example sentences
			"explain" => $this->getExplain(),	// concise definitions
			"synonym" => $this->getSynonym(),	// synonyms and antonyms
			"phrase"  => $this->getPhrase()		// array of phrases
		);
	}


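	/**
	 * Fetch the Baidu dictionary result page for the current word.
	 * Performs an HTTP GET through cURL.
	 *
	 * @return string|false response body, or false if the transfer failed
	 */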

	private function getContent(){
		// Downloads the dict.baidu.com result page for $this->word with an HTTP GET
		// and returns the response body; curl_exec() yields false when the transfer fails.
		$useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
		// Percent-encode the word so that spaces, '&', '#' or non-ASCII characters
		// cannot break or truncate the query string.
		$url = "http://dict.baidu.com/s?wd=".urlencode($this->word);
		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);	// hand the body back instead of printing it
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_HTTPGET, 1);
		curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
		curl_setopt($ch, CURLOPT_HEADER, 0);			// body only, no response headers
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		$result = curl_exec($ch);
		// The error state lives on the handle that ran the transfer, so it has to be
		// read from $ch, and before that handle is closed.
		if (curl_errno($ch)) {
			echo 'Errno'.curl_error($ch);
		}
		curl_close($ch);
		return $result;
	}


	/**
	 * Get the phonetic symbols from the Baidu translation page.
	 *
	 * @return array array('en' => British, 'us' => American)
	 */

	private function Pronounced(){
		// Extracts the phonetic symbols from the fetched page: the text that follows
		// each `"EN-US">` marker. The first hit is returned as 'en', the second as 'us'.
		$data = $this -> getContent();
		// The capture stops at the next '<', i.e. at whatever tag closes the element.
		// The pattern must end in an unescaped '/' delimiter; with an escaped one PCRE
		// rejects the whole expression and nothing is ever matched.
		preg_match_all("/\"EN\-US\"\>(.*)\</Ui",$data,$pronounced);
		// A page without phonetic symbols produces fewer than two hits; report null
		// for whatever is missing instead of reading undefined offsets.
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

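	/**
	 * Get the pronunciations from the Baidu translation page
	 * (the values of the url="..." attributes found in it).
	 *
	 * @return array array('en' => British, 'us' => American)
	 */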

	private function getSay(){
		// Collects the values of the url="..." attributes on the fetched page.
		// The first hit is returned as 'en', the second as 'us'.
		$data = $this -> getContent();
		preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced);
		// Fewer than two hits (or a failed fetch) must not trigger undefined-offset
		// errors; missing entries are reported as null.
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
	 * Get the example sentences from the Baidu translation page.
	 *
	 * @return array multi-dimensional array of example sentences
	 */

	private function getExample(){
		// Builds pairs of sentences from the `var example_data = [...];` literal that
		// is embedded in the fetched page. Returns a list of two-element chunks, or an
		// empty array when the page carries no example data.
		$data = $this -> getContent();
		preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example);
		if (empty($example[1])) {
			return array();
		}

		$sentences = array();
		// Normalise the leading brackets so that every example starts with "[[[".
		$examples = explode("[[[", "[[[".ltrim($example[1][0],"["));
		foreach ($examples as $exampleText) {
			// Within one example every "[[" opens a token list
			// (presumably one list per sentence -- TODO confirm against live data).
			foreach (explode("[[", "[[".$exampleText) as $tokenList) {
				// Grab the first quoted string of each ["token", ...] entry.
				preg_match_all("/\[\"(.*)\",/Us", "[".$tokenList, $match);
				if (!empty($match[1])) {
					// implode() takes the separator first; the array-first order was
					// removed in PHP 8.0. Sentences are kept in an array rather than
					// joined with "@" and split again, so a sentence containing "@"
					// stays in one piece.
					$sentences[] = implode(" ", $match[1]);
				}
			}
		}

		// Consecutive sentences are grouped two by two.
		return array_chunk($sentences, 2);
	}

	/**
	 * Get the concise definitions.
	 *
	 * @return array array("x" => parts of speech, "b" => affiliated forms)
	 */

	// NOTE(review): from this point to the end of the class the listing is damaged -- it looks
	// like HTML tags were stripped out of the source when it was published. Visible symptoms:
	//   - regex patterns are missing pieces and named groups have lost their names
	//     (e.g. "(?P.*)" although the code reads $a_data["adj"] / $a_data["name"]);
	//   - explode() is called with an empty "" delimiter;
	//   - getPhrase() contains the fragment "if($n $value) {", i.e. statements are missing.
	// The code below is therefore left untouched; restore these methods from the project
	// repository before relying on getExplain(), getSynonym() or getPhrase().
	private function getExplain(){
		$data = $this -> getContent();
		preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\
/Us",$data,$explain); $r_data = $explain[1][0]; preg_match_all("/\

\(?P.*)\\(?P.*)\\/Us", $r_data, $a_data); preg_match_all("/\(?P[^\>]+)\:\(?P.*)\\/Us", $r_data, $b_data); $result = array(); foreach ($a_data["adj"] as $key => $value) { $result[$value] = $a_data["name"][$key]; } $word_b = array(); foreach ($b_data["tag"] as $key => $value) { $word_b[$value] = strip_tags($b_data["word"][$key]); } $result_data = array("x" => $result,"b" => $word_b); return $result_data; } /** * 获取同义词 * return array(0 => "同义词", 1 => "反义词") 一般为多维数组 * */ private function getSynonym(){ $data = $this -> getContent(); preg_match_all("/id=\"en\-syn\-ant\"\>(.*)

/Us",$data,$synonym); $content = $synonym[1][0]; $data1 = explode("", $content); $result = array(); $data2 = array(); foreach ($data1 as $key => $value) { preg_match_all("/\(?P.*)\&nbsp\;\\\
\
    (?.*)\/Us", $value, $r_data); $data2[$key]["adj"] = $r_data["adj"]; $data2[$key]["content"] = $r_data["content"]; } foreach ($data2 as $key => $value) { foreach ($value["content"] as $k => $v) { if(!empty($v)){ preg_match_all("/\
  • \

    (?P

    .*)\(?P<value>.*)\/Us", $v, $v_data); foreach ($v_data['title'] as $m => $d) { $data = strip_tags(preg_replace(">"," ", $v_data["value"][$m])); $result[$key][$value["adj"][$k]][$d] = $data; } } } } return $result; } /** * 获取短语词组 * return array (key => value) 一维或者多维数组 * */ private function getPhrase(){ $num = self::$num; $data = $this -> getContent(); preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class>/Us",$data,$phrase); $data = explode("",$phrase[1][0]); $data1 = array_slice($data,0,$num); $result = array(); foreach ($data1 as $key => $value) { $data2 = explode("", $value); $n = count($data2); if($n $value) { foreach ($value as $k => $v) { $value[$k] = strip_tags($v); } $array = array($result[$key_value],$value); if (array_key_exists($key_value, $result)){ $result[$key_value] = $array; } } } } return $result; } /** * 将数组转换为字符串 * * @param array $data 数组 * @param bool $isformdata 如果为0,则不使用new_stripslashes处理,可选参数,默认为1 * @return string 返回字符串,如果,data为空,则返回空 */ private function array2string($data, $isformdata = 1) { if($data == '') return ''; if($isformdata) $data = $this->new_stripslashes($data); return addslashes(var_export($data, TRUE)); } /** * 返回经stripslashes处理过的字符串或数组 * @param $string 需要处理的字符串或数组 * @return mixed */ private function new_stripslashes($string) { if(!is_array($string)) return stripslashes($string); foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val); return $string; } } // $word = new dict("express"); // $word ->content(); </div> <p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。</p></value>

声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。