node.js - node crawler, how to use IP pool to prevent anti-crawling?
为情所困
为情所困 2017-06-05 11:13:33
0
1
1028

Here is the problem. I have just started learning Node, and naturally the first thing I wanted to try with it was writing crawlers. I have been reading a novel recently, but the free novel websites carry too many ads, so I planned to write a crawler to download the entire novel. However, my URL requests were too frequent, so the site's anti-crawling protection kicked in and blocked me. I tried increasing the interval between requests to avoid this, but it had no effect. I then happened to learn that switching IP addresses can get around the block. There are very few resources on this topic on Baidu, though, so I came here to ask the experts. If you know anything about this, please share it. Thanks in advance.

To emphasize: the question is how to change the IP address that Node's requests come from, in order to avoid anti-crawling measures.

The libraries used include superagent, cheerio, async... Thanks in advance, experts.
Code:

var superagent = require('superagent');
var cheerio = require('cheerio');
var file = require('./writeText.js');
require('superagent-charset')(superagent);

// NOTE(review): `str` is not referenced anywhere in the visible code — confirm it is still needed.
var str = '';
// Shared counter: fetchUrl increments it once per fetch, and getArticle uses it
// in the output file name and in the progress log.
// NOTE(review): the starting value 150 is presumably the offset of an earlier,
// interrupted run — TODO confirm.
var count = 150;

/**
 * Entry point for fetching one chapter: advances the shared `count` and then
 * hands the same three arguments to getArticle.
 *
 * @param {string} url - Page address, forwarded to getArticle.
 * @param {function} callback - Completion callback, forwarded to getArticle.
 * @param {number} len - Total number of pages, forwarded to getArticle.
 */
var fetchUrl = function (url, callback, len) {
  // Count the fetch before it starts so getArticle sees the updated value.
  ++count;
  getArticle(url, callback, len);
};

/**
 * Downloads one chapter page and saves its title and body to a text file.
 * If the page contains a "下一页" (next page) link, that page is downloaded
 * as well and appended to the same file.
 *
 * Relies on names defined outside this function: `superagent` (with the
 * charset plugin and a plugin providing `.proxy()` installed), `cheerio`,
 * `file`, `count`, `proxy` and `ip`.
 * NOTE(review): `proxy` and `ip` are not defined in the visible part of the
 * file — confirm they are set before this function runs.
 *
 * @param {string} url - Address of the chapter's first page.
 * @param {function} callback - Node-style callback, invoked exactly once:
 *   `callback(err)` when a request fails, otherwise
 *   `callback(null, url + ' html content')`.
 * @param {number} len - Total number of chapters; only used in the progress log.
 */
function getArticle (url, callback, len) {
  // Snapshot the shared counter: other fetches may increment `count` while
  // this one is waiting on the network, and both the write and the append
  // must target the same file.
  var index = count;

  // Backslashes are escaped; unescaped, "\n" in "\nodeTextTest" is a newline
  // and the remaining backslashes are silently dropped.
  // NOTE(review): the directory is hard-coded, and the former append call
  // pointed at an "Administrator.lic-PC" profile instead — confirm which
  // profile directory is the right one.
  var outputPath = 'C:\\Users\\Administrator\\Desktop\\nodeTextTest\\writeFileTest-' + index + '.txt';

  // Builds "<title>\n<body>" from a loaded page.
  function chapterText ($) {
    return $('#nr_title').text() + '\n' + $('#nr').text();
  }

  // Reports progress and signals success to the caller.
  function finish () {
    console.log('progress: ' + index + '/' + len);
    callback(null, url + ' html content');
  }

  superagent.get(url)// 'http://m.kanshuzw.com/4/4201/'
  .proxy(proxy)
  .charset('gbk')
  .end(function (err, sres) {
    if (err) {
      console.error(err);
      // Propagate the failure; never calling back would stall the caller.
      return callback(err);
    }

    var $ = cheerio.load(sres.text);
    file.writeFile(chapterText($), outputPath);

    // Pick the first "next page" link only, so the callback cannot fire more
    // than once no matter how many pager links the page contains.
    var nextHref = null;
    $('.nr_page .dise').each(function (idx, element) {
      var $element = $(element);
      var href = $element.attr('href');
      if (nextHref === null && typeof href === 'string' && $element.text() === '下一页') {
        nextHref = href;
      }
    });

    if (nextHref === null) {
      // No continuation page (or no pager at all): the chapter is complete.
      return finish();
    }

    superagent.get('http://m.kanshuzw.com' + nextHref)
    .set("X-Forwarded-For", ip)
    .proxy(proxy)
    .charset('gbk')
    .end(function (nextErr, nextRes) {
      if (nextErr) {
        console.log('2 error end.');
        console.error(nextErr);
        return callback(nextErr);
      }
      var $next = cheerio.load(nextRes.text);
      file.appendFile(chapterText($next), outputPath);
      finish();
    });
  });
}


Waiting for an expert to answer...

为情所困
为情所困

Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template
About us Disclaimer Sitemap
php.cn:Public welfare online PHP training,Help PHP learners grow quickly!