Node.js + jade + MongoDB + mongoose: separating the crawler's database storage from static file generation

Keywords: JavaScript, Mongoose, MongoDB, Node.js, database

This article continues from the previous one, "Node.js + jade crawls all blog articles to generate static html files". Here, collection into the database and static file generation are implemented separately: in a real crawling project, the data should first be stored in a database, and the static files should then be generated from it selectively.

For the database I chose MongoDB. Why this one? Because it is collection-based and its data operations are essentially JSON, so it has a very strong affinity with the DOM module cheerio: the data filtered out by cheerio can be inserted into MongoDB directly, without any further processing, which is very convenient. Its affinity with Node.js goes without saying, and more importantly its performance is excellent. I won't go into the basics of MongoDB in this article; I will cover them from scratch in a separate post.
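As a minimal illustration of that affinity (a sketch only; the Page model and its fields here are hypothetical and not the schema used later in this article), the plain object built from cheerio's selections can be handed to mongoose as-is:

var cheerio = require('cheerio');
var mongoose = require('mongoose');

//Sketch only: "Page" and its fields are hypothetical, not part of this article's schema
var db = mongoose.createConnection('mongodb://localhost:27017/crawler');
var Page = db.model('Page', new mongoose.Schema({ title: String, link: String }));

var $ = cheerio.load('<a id="t" href="/1.html">hello</a>');
//The object built from cheerio's output is already plain JSON-like data...
var doc = { title: $('#t').text(), link: $('#t').attr('href') };
//...so it can be passed straight to the model without any transformation
new Page(doc).save(function (err) {
    if (err) console.log(err);
});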

At this stage I have split the crawler into two modules: collection (crawler.js) and static file generation (makeHtml.js).

crawler.js:

var http = require('http');
var cheerio = require('cheerio');
var mongoose = require('mongoose');
mongoose.Promise = global.Promise;
var DB_URL = 'mongodb://localhost:27017/crawler';

var aList = []; //List information of blog articles
var aUrl = [];  //All blog article urls

var db = mongoose.createConnection(DB_URL);
db.on('error', function (err) {
    console.log(err);
});
db.on('connected', function () {
    console.log('db connected success');
});

var Schema = mongoose.Schema;
var arcSchema = new Schema({
    id: Number,       //Article id
    title: String,    //Article title
    url: String,      //Article link
    body: String,     //Article content
    entry: String,    //Abstract
    listTime: Date    //Release time
});
var Article = db.model('Article', arcSchema);

//Insert one article document into the Article collection
function saveArticle(arcInfo) {
    var arcModel = new Article(arcInfo);
    arcModel.save(function (err, result) {
        if (err) {
            console.log(err);
        } else {
            console.log(`${arcInfo['title']}   Insert success`);
        }
    });
}

//Extract id, title, url and body from a single article page
function filterArticle(html) {
    var $ = cheerio.load(html);
    var title = $("#cb_post_title_url").text();
    var href = $("#cb_post_title_url").attr("href");
    var re = /\/(\d+)\.html/;
    var id = href.match(re)[1];
    var body = $("#cnblogs_post_body").html();
    return {
        id: id,
        title: title,
        url: href,
        body: body
    };
}

//Fetch one article page, save it, then fetch the next url after a 100ms pause
function crawlerArc(url) {
    var html = '';
    http.get(url, function (res) {
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            var arcDetail = filterArticle(html);
            saveArticle(arcDetail);
            if (aUrl.length) {
                setTimeout(function () {
                    if (aUrl.length) {
                        crawlerArc(aUrl.shift());
                    }
                }, 100);
            } else {
                console.log('Collection task completed');
            }
        });
    });
}

//Extract title, url, abstract and release time for every post on a list page
function filterHtml(html) {
    var $ = cheerio.load(html);
    var arcList = [];
    var aPost = $("#content").find(".post-list-item");
    aPost.each(function () {
        var ele = $(this);
        var title = ele.find("h2 a").text();
        var url = ele.find("h2 a").attr("href");
        ele.find(".c_b_p_desc a").remove();
        var entry = ele.find(".c_b_p_desc").text();
        ele.find("small a").remove();
        var listTime = ele.find("small").text();
        var re = /\d{4}-\d{2}-\d{2}\s*\d{2}[:]\d{2}/;
        listTime = listTime.match(re)[0];

        arcList.push({
            title: title,
            url: url,
            entry: entry,
            listTime: listTime
        });
    });
    return arcList;
}

//Follow the pager until the last list page, then start collecting article urls
function nextPage(html) {
    var $ = cheerio.load(html);
    var nextUrl = $("#pager a:last-child").attr('href');
    if (!nextUrl) return getArcUrl(aList);
    var curPage = $("#pager .current").text();
    if (!curPage) curPage = 1;
    var nextPage = nextUrl.substring(nextUrl.indexOf('=') + 1);
    //Compare as numbers, not strings, so pagination does not stop at page 10
    if (parseInt(curPage, 10) < parseInt(nextPage, 10)) crawler(nextUrl);
}

//Fetch one list page, collect its posts, then move on to the next page
function crawler(url) {
    http.get(url, function (res) {
        var html = '';
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            aList.push(filterHtml(html));
            nextPage(html);
        });
    });
}

//Flatten the per-page lists into aUrl and start crawling the articles one by one
function getArcUrl(arcList) {
    for (var key in arcList) {
        for (var k in arcList[key]) {
            aUrl.push(arcList[key][k]['url']);
        }
    }
    crawlerArc(aUrl.shift());
}

var url = 'http://www.cnblogs.com/ghostwu/';
crawler(url);

The other core modules have not changed much. The main additions are the database connection, the creation of the database and the collection (a collection is the equivalent of a table in a relational database), the Schema (the equivalent of a relational table's structure), and the mongoose write operation (save: insert a document). Static file generation has been split out into its own module.
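To confirm that the crawler actually stored the documents before generating any files, a quick query against the same model works (a sketch, assuming the connection and Article model defined in crawler.js; by default mongoose puts the documents into the articles collection of the crawler database):

//Sketch: check what the crawler stored (requires the Article model from crawler.js)
Article.count({}, function (err, n) {
    if (err) return console.log(err);
    console.log('articles stored: ' + n);
});
Article.findOne({}).sort({ id: 1 }).exec(function (err, doc) {
    if (!err && doc) console.log(doc.id, doc.title);
});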

makeHtml.js:
var fs = require('fs');
var jade = require('jade');

var mongoose = require('mongoose');
mongoose.Promise = global.Promise;
var DB_URL = 'mongodb://localhost:27017/crawler';

var allArc = [];
var count = 0;

var db = mongoose.createConnection(DB_URL);
db.on('error', function (err) {
    console.log(err);
});
db.on('connected', function () {
    console.log('db connected success');
});

var Schema = mongoose.Schema;
var arcSchema = new Schema({
    id: Number,       //Article id
    title: String,    //Article title
    url: String,      //Article link
    body: String,     //Article content
    entry: String,    //Abstract
    listTime: Date    //Release time
});
var Article = db.model('Article', arcSchema);

//Render one article into a static html file, then process the next one after 100ms
function makeHtml(arcDetail) {
    var str = jade.renderFile('./views/layout.jade', arcDetail);
    ++count;
    fs.writeFile('./html/' + count + '.html', str, function (err) {
        if (err) {
            return console.log(err);
        }
        console.log(`${arcDetail['id']}.html Create success ` + count);
        if (allArc.length) {
            setTimeout(function () {
                makeHtml(allArc.shift());
            }, 100);
        }
    });
}

//Load all stored articles ordered by id, then generate the files one by one
function getAllArc() {
    Article.find({}).sort({ 'id': 1 }).exec(function (err, arcs) {
        allArc = arcs;
        makeHtml(allArc.shift());
    });
}
getAllArc();
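With MongoDB running locally, the two scripts are run in order: node crawler.js first to fill the database, then node makeHtml.js to generate the files into ./html/. The article does not show views/layout.jade; below is a minimal sketch of what it might look like, assuming the template only uses the title and body fields of arcDetail (body is output unescaped with != because it already contains HTML):

//- Sketch only: a guess at views/layout.jade, not the author's actual template
doctype html
html
  head
    meta(charset="utf-8")
    title= title
  body
    h1= title
    #content!= body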
