NodeJS implements web crawler function example code

零下一度
Release: 2017-06-26 10:28:16
Original
3895 people have browsed it

Previous words

This article will use nodeJS to implement a simple web crawler function

Web page source code

Use the http.get() method to obtain Web page source code, taking the headline page of the hao123 website as an example

http://tuijian.hao123.com/hotrank
Copy after login
var http = require('http');

// Download the hao123 hot-rank page and print its HTML source to stdout.
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';
  // Decode at the stream level: `data += chunk` on raw Buffers decodes each
  // chunk separately and corrupts a multi-byte character that happens to be
  // split across two chunks.
  res.setEncoding('utf8');
  // Accumulate the body as it arrives.
  res.on('data', function (chunk) {
    data += chunk;
  });
  // The whole body has been received: print it.
  res.on('end', function () {
    console.log(data);
  });
}).on('error', function (err) {
  // Without this handler a DNS/connection failure is thrown as an uncaught
  // 'error' event and terminates the process.
  console.error('Request failed: ' + err.message);
});
Copy after login

The results obtained are as follows:

          热点排行榜-头条新闻-hao123新闻导航_hao123上网导航     

电视剧

排名关键词搜索指数

综艺

排名关键词搜索指数
Copy after login
View Code

Filter data

Take the variety show ranking on the webpage as an example

The relevant source code is as follows

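Through analysis, it can be seen that the 'Variety Show' module and the other ranking modules are each located in an element with class "top-wrap". Among them, the inner div of the variety show module carries the attribute monkey="zy"; the 10 variety shows are all located in elements with class "point-bd", and the name of each variety show is located in an element with class "point-title".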

cheerio

How do we get useful data out of the source code? Node.js does not provide a document object, so the browser DOM API is not available. The crude alternative is to process the source with regular expressions.

cheerio is a fast, flexible implementation of core jQuery designed specifically for the server (Node.js). It works on a DOM model and is very efficient at parsing, manipulating and rendering.

【Installation】

npm install cheerio

【Use】

Its usage is very similar to jQuery, so it is easy to get started. First, take the titles of the top 10 most popular variety shows as an example. After that, the rankings of all six sections ('Real-time Hot Topics', 'Today's Hot Topics', 'People's Livelihood Hot Topics', 'Movies', 'TV Series' and 'Variety Shows') are crawled and stored as arrays in an object named 'result'. The corresponding monkey attribute values are 'ss', 'jr', 'ms', 'dy', 'dsj' and 'zy'.

[The code is as follows]

var http = require('http');
var cheerio = require('cheerio');

// Download the hao123 hot-rank page and hand its HTML source to filter().
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';
  // Decode at the stream level so a multi-byte character split across two
  // chunks is not corrupted by per-chunk string conversion.
  res.setEncoding('utf8');
  res.on('data', function (chunk) {
    data += chunk;
  });
  res.on('end', function () {
    filter(data);
  });
}).on('error', function (err) {
  // Without this handler a network failure is thrown as an uncaught
  // 'error' event and terminates the process.
  console.error('Request failed: ' + err.message);
});

/**
 * Extracts the variety show titles from the page source and logs them.
 *
 * @param {string} data - HTML source of the hot-rank page.
 */
function filter(data) {
  // Holds the titles of the top 10 variety shows by search volume.
  var result = [];
  // Turn the page source into a $ object.
  var $ = cheerio.load(data);
  // Find the title element of every variety show inside the block
  // marked monkey="zy".
  var temp_arr = $('[monkey = "zy"]').find('.point-bd').find('.point-title');
  // Save the titles into the result array in page order.
  temp_arr.each(function (index, item) {
    result.push($(item).text());
  });
  // Example output:
  // [ '变形计','来吧冠军','拜托了冰箱','昆仑决','天生是优我','姐姐好饿','脑力男人时代','奔跑吧兄弟','我想和你唱','玫瑰之旅' ]
  console.log(result);
}
Copy after login

[The results are as follows]
var http = require('http');
var cheerio = require('cheerio');

// Download the hao123 hot-rank page and hand its HTML source to filter().
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';
  // Decode at the stream level so a multi-byte character split across two
  // chunks is not corrupted by per-chunk string conversion.
  res.setEncoding('utf8');
  res.on('data', function (chunk) {
    data += chunk;
  });
  res.on('end', function () {
    filter(data);
  });
}).on('error', function (err) {
  // Without this handler a network failure is thrown as an uncaught
  // 'error' event and terminates the process.
  console.error('Request failed: ' + err.message);
});

/**
 * Collects the titles listed under every ranking on the page and logs them
 * as one object.
 *
 * @param {string} data - HTML source of the hot-rank page.
 */
function filter(data) {
  // Holds the top 10 names of each ranking: every key is a ranking name
  // (the heading text as it appears on the page) and every value is an
  // array of that ranking's titles.
  var result = {};
  // Turn the page source into a $ object.
  var $ = cheerio.load(data);
  // Find the container of each of the six rankings (real-time hot topics,
  // today's hot topics, people's livelihood hot topics, movies, TV series,
  // variety shows).
  var temp_div = $('.top-wrap');
  // Holds the ranking names, in page order.
  var temp_title = [];
  temp_div.each(function (index, item) {
    // Read the ranking name from its <h2> and save it to the temp_title array.
    temp_title.push($(item).find('h2').text());
    // Find the title element of every entry in this ranking.
    var temp_arr = $(item).find('.point-bd').find('.point-title');
    // Initialise this ranking's entry in result as an empty array.
    var innerResult = result[temp_title[index]] = [];
    // Save the titles into this ranking's array in page order.
    temp_arr.each(function (_index, _item) {
      innerResult.push($(_item).text());
    });
  });
  console.log(result);
}
Copy after login

The above is the detailed content of NodeJS implements web crawler function example code. For more information, please follow other related articles on the PHP Chinese website!

source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template
About us Disclaimer Sitemap
php.cn:Public welfare online PHP training,Help PHP learners grow quickly!