// Include the PHP Simple HTML DOM parser library
include_once ('./simplehtmldom/simple_html_dom.php');
// Fetch the HTML of a page
/**
 * Download a URL with cURL and return the response body, trimmed.
 *
 * Redirects are followed, and the connection phase is limited to 10 seconds.
 *
 * @param string $url Address to fetch.
 * @return string Response body with leading/trailing whitespace removed, or an
 *                empty string when the cURL handle cannot be created or the
 *                transfer fails.
 */
function getwebcontent($url){
    $ch = curl_init();
    if ($ch === false) {
        // No handle means nothing can be fetched; report "no content".
        return '';
    }

    $timeout = 10;
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the body as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

    $contents = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure; make that case explicit rather
    // than relying on trim() coercing false to an empty string.
    return $contents === false ? '' : trim($contents);
}
// Get the titles and URLs of the articles
// Download the listing page that links to the individual articles.
$string = getwebcontent('http://www.babytree.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Collected data, kept as three parallel lists indexed by article position.
$article = array(
    'title'   => array(),
    'link'    => array(),
    'content' => array(),
);

// Regular-expression matching: capture the article id (group 1) and the
// title (group 2) of every article link on the listing page.
// NOTE(review): the pattern previously in this file had a single capture group
// (its markup appears to have been lost), while the loop below reads groups 1
// and 2. This pattern is reconstructed from the link prefix used below;
// verify it against the live page markup.
$pattern = '#<a[^>]*href="http://www\.babytree\.com/learn/article/([^"]+)"[^>]*>(.*?)</a>#is';
$out = array();
if (preg_match_all($pattern, $string, $out, PREG_SET_ORDER)) {
    foreach ($out as $match) {
        $article['title'][] = $match[2];
        $article['link'][] = "http://www.babytree.com/learn/article/" . $match[1];
    }
}

// Fetch each article by URL and keep the inner HTML of its first page block.
foreach ($article['link'] as $link) {
    // Always push exactly one entry per link so the lists stay aligned,
    // even when the download or the lookup fails.
    $content = '';
    $html = file_get_html($link);
    if ($html) {
        $div = $html->find('div[id=pagenum_0]');
        if (isset($div[0])) {
            $content = $div[0]->innertext;
        }
        // Release the parsed DOM before the next iteration to keep memory
        // use flat across many pages.
        $html->clear();
        unset($html);
    }
    $article['content'][] = $content;
}

// Save each article to "<title>.txt".
foreach ($article['title'] as $i => $title) {
    // Titles come from a remote page: drop markup and replace characters that
    // are unsafe in file names (this also blocks path traversal). This is done
    // on the UTF-8 text, before transcoding, because GBK trail bytes can
    // coincide with ASCII characters such as "\" and "|".
    $name = trim(strip_tags($title));
    $name = str_replace(array('\\', '/', ':', '*', '?', '"', '<', '>', '|'), '_', $name);
    if ($name === '') {
        continue;
    }

    // Title transcoding. The original author notes that the content is UTF-8
    // to begin with, but the files could not be saved under these titles
    // without converting them (presumably a GBK-locale filesystem — TODO confirm).
    $fileName = iconv('utf-8', 'gbk', $name);
    if ($fileName === false) {
        // Conversion failed; fall back to the UTF-8 name instead of ".txt".
        $fileName = $name;
    }

    file_put_contents("{$fileName}.txt", $article['content'][$i]);
}
/* I originally wanted to post this before 12 o'clock, but it is already 3:30, so just count it as yesterday's post.
In principle, regular expressions are the best and fastest way to obtain the article content.
However, good as they are, regular expressions are really difficult to write! So I did a little research and found that
many people on the Internet also use PHP Simple HTML DOM. Although it is a bit slower, the result is still good.
It takes about 7-8 seconds from including the class library file to writing the txt files. This can be optimized further, especially by using regular expressions to obtain the article content, although writing those is really unpleasant.
You can do some research on that yourself. */
?>