Nodejs爬虫Demo

2024-11-01 · 188 阅读 · 0 评论 · ADMIN

Nodejs爬虫Demo

前言

此demo展示node-crawler与async结合串行执行，爬取小说网站，并将每个章节内容存入本地。
实例中爬取的某小说网站，只为学习不做商业行为，如有侵权请联系我。
源码已上传至 GitHub，包含 TypeScript 版。

安装

npm install async crawler
# or
yarn add async crawler

demo

Node.js 开发最大的烦恼就是异步问题，async 就是为解决这个问题而存在的，这里不做过多介绍。
先上代码：

const Crawler = require("crawler");
const async = require("async");
const fs = require("fs");
const path = require("path");

/** Address of the novel's table-of-contents page; chapter links are built from it. */
const url = "http://www.52ggd.com/book/5/5203";

/**
 * Entry point of the demo.
 *
 * The crawl is split into two stages that run in order through
 * async.waterfall, which hands the result of one stage to the next:
 * first collect the chapter list, then download every chapter.
 */
(async () => {
  const timerLabel = "执行时间";
  const stages = [getAllChapter, getContent];

  console.time(timerLabel);
  console.log("爬虫开始");

  async.waterfall(stages, (error) => {
    if (!error) {
      console.log("爬虫结束");
    } else {
      console.error(error);
    }
    // Report the elapsed time on both the success and the failure path.
    console.timeEnd(timerLabel);
  });
})();

/**
 * Crawls the table-of-contents page (`url`) and collects every chapter link.
 *
 * Each `<dd>` directly under the first `.chapterlist` element contributes one
 * entry built from its `<a>` child; entries without an `href` are skipped.
 *
 * @param {function(?Error, Array<{link: string, name: string}>=): void} cb
 *   Node-style callback, invoked when the crawler queue drains. Receives the
 *   first error raised while fetching or parsing the page, or `null` plus the
 *   chapter list (which async.waterfall passes on to the next stage).
 */
function getAllChapter(cb) {
  const chapterList = [];
  let failure = null;

  const crawler = new Crawler({
    callback: function (error, res, done) {
      try {
        if (error) {
          throw error;
        }

        const $ = res.$;
        if (typeof $ !== "function") {
          // Without a parsed document there is nothing to select from.
          throw new Error(`No parsed HTML available for ${url}`);
        }

        const container = $(".chapterlist");
        $(container[0])
          .children("dd")
          .each((index, item) => {
            const anchor = $(item).children("a");
            const href = anchor.attr("href");
            if (!href) {
              return;
            }
            chapterList.push({
              // The href is joined onto the book URL with an explicit "/".
              link: `${url}/${href}`,
              name: anchor.text(),
            });
          });
      } catch (err) {
        console.error(err);
        if (!failure) {
          failure = err;
        }
      } finally {
        // Always release the crawler slot, otherwise "drain" never fires.
        done();
      }
    },
  });

  // Fired once the queue is empty: report the outcome to the caller.
  crawler.on("drain", () => {
    if (failure) {
      cb(failure);
      return;
    }
    console.log(`所有章节已导出 共${chapterList.length}章`);
    cb(null, chapterList);
  });

  // Queue the table-of-contents page.
  crawler.queue(url);
}

/**
 * Downloads every chapter and stores its text as `<name>.txt` inside the
 * `txt` directory next to this script.
 *
 * A chapter that fails to download or save is logged and does not stop the
 * remaining chapters; the first such error is handed to `cb` once the queue
 * has drained.
 *
 * @param {Array<{link: string, name: string}>} chapter
 *   Chapter list produced by the previous waterfall stage.
 * @param {function(?Error): void} cb
 *   Node-style callback: `null` on success, otherwise the first error.
 */
function getContent(chapter, cb) {
  // Nothing to queue means "drain" may never fire, so finish right away.
  if (!Array.isArray(chapter) || chapter.length === 0) {
    cb(null);
    return;
  }

  /** Replaces characters that are invalid in file names with "_". */
  const toSafeFileName = (name) => String(name).replace(/[\\/:*?"<>|]/g, "_");

  const localPath = path.resolve(__dirname, `txt`);
  try {
    // Create the output directory once instead of checking it per chapter.
    fs.mkdirSync(localPath, { recursive: true });
  } catch (err) {
    cb(err);
    return;
  }

  let failure = null;

  const crawler = new Crawler({
    // Original author's note: number of tasks the queue runs at once; set
    // high so that chapters are not lost when there are many of them.
    maxConnections: 200,
    callback: async function (error, res, done) {
      try {
        if (error) {
          throw error;
        }

        const $ = res.$;
        if (typeof $ !== "function") {
          throw new Error(`No parsed HTML available for ${res.options.filename}`);
        }

        const contentBox = $("#BookText");
        const content = $(contentBox[0]).text();

        console.log(`开始爬取【${res.options.filename}】`);

        // writeFile opens, writes and closes the file; awaiting it ensures the
        // chapter is saved before the crawler slot is released.
        await fs.promises.writeFile(
          path.resolve(localPath, `${toSafeFileName(res.options.filename)}.txt`),
          content
        );
      } catch (err) {
        console.error(err);
        if (!failure) {
          failure = err;
        }
      } finally {
        // Always release the crawler slot, otherwise "drain" never fires.
        done();
      }
    },
  });

  crawler.on("drain", () => {
    if (failure) {
      cb(failure);
      return;
    }
    console.log(`所有内容写入完毕，存入：${localPath}`);
    cb(null);
  });

  // All chapters are queued at once; the crawler runs them in parallel.
  crawler.queue(
    chapter.map(({ link, name }) => ({ uri: link, filename: name }))
  );
}

文章版权声明：除非注明，否则均为猿易帮原创文章，转载或复制请以超链接形式注明出处。

Ubuntu16.04部署Gitlab服务

Nodejs crawler 爬虫

相关阅读

使用Nodejs写一个英雄联盟英雄故事网站

Nodejs crawler 爬虫

发表评论取消回复

评论列表（暂无评论，188人围观）

还没有评论，来说两句吧...

目录[+]

微信二维码

微信二维码

支付宝二维码