Node.js + jade: crawling all blog articles and generating static html files

Keywords: JavaScript node.js less MongoDB

This weekend, I combined node.js with the jade template engine to generate static html files. This article assumes you already know jade and have read my previous article. First, here are the reference links:

Node.js Template Engine Tutorial: jade Quick Study and Practice 1 - Basic Usage

Node.js Template Engine Tutorial: jade Quick Study and Practice 2 - Flow Control, Escaped and Unescaped Output

Node.js Template Engine Tutorial: jade Quick Study and Practice 3 - mixin

Node.js Template Engine Tutorial: jade Quick Study and Practice 4 - Template Includes, Inheritance, and Plugin Use

Node.js Implements a Simple Crawler: Grabbing the List Information of All Blog Articles

In the article shared above, I grabbed the list of all the articles on my blog, but I didn't save them to a database or do anything else with them. In this article, we take that article list, crawl every article, and generate a static html file for each one. First, a look at the result: my blog currently has 77 articles, and collecting and generating all of them takes less than one minute. The screenshots below show part of the output; each file is named after the article's id, and every generated article comes from one simple static template that I wrote.

Project structure: (screenshots omitted; the project consists of server.js, a views/ directory holding layout.jade, and an html/ directory for the generated files)

Well, next, let's talk about the two main tasks in this article:

1. Grab each article: mainly the title, content, hyperlink, and article id (used to name the generated static html file)

2. Generate html files based on jade templates

First, how do we grab an article?

It's very simple; the implementation is similar to grabbing the article list in the previous article.

function crawlerArc( url ){
    var html = '';
    var str = '';
    var arcDetail = {};
    http.get(url, function (res) {
        // accumulate the response body chunk by chunk
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            // extract id, title, href and body from the raw html
            arcDetail = filterArticle( html );
            // render the jade template with the article data
            str = jade.renderFile( './views/layout.jade', arcDetail );
            // write the rendered page, named after the article id
            fs.writeFile( './html/' + arcDetail['id'] + '.html', str, function( err ){
                if( err ) {
                    console.log( err );
                }
                console.log( 'success: ' + url );
                // collect the next article, if any are left
                if ( aUrl.length ) crawlerArc( aUrl.shift() );
            } );
        });
    });
}

The parameter url is the address of an article. After the article page has been downloaded, filterArticle( html ) is called to extract the required article information (id, title, hyperlink, content), and then jade's renderFile API renders the template with that data.
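For readers who have not used it before, here is a minimal, self-contained sketch of the renderFile API (the demo.jade template and its contents are made up for illustration): it takes a template file path plus a locals object and returns the rendered html as a string.

var jade = require( 'jade' );

// hypothetical one-line template, e.g. views/demo.jade containing: h1= title
var str = jade.renderFile( './views/demo.jade', { title: 'hello' } );
console.log( str ); // <h1>hello</h1>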

After the template has been rendered, the result must be written out as an html file, so writeFile is called with the article id as the file name. This is how each static html file is generated.
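One caveat worth noting (this guard is my addition, not part of the original script): fs.writeFile fails with ENOENT if the ./html output directory does not exist yet, so it is worth creating it once at startup:

var fs = require( 'fs' );

// create the output directory before the first write, if it is missing
if ( !fs.existsSync( './html' ) ) {
    fs.mkdirSync( './html' );
}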

Next is the loop that keeps generating static html files, which is driven by the following line:

if ( aUrl.length ) crawlerArc( aUrl.shift() );
 
aUrl holds the URLs of all the articles on my blog. Each time an article has been collected, its URL is removed from the front of the array with shift, and the next URL is passed to crawlerArc to continue collecting.
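This shift-and-recurse pattern processes the queue one article at a time instead of firing off all 77 requests at once. Stripped of the crawling details, the pattern looks like this (a standalone illustration with made-up URLs):

var queue = [ '/p/1.html', '/p/2.html', '/p/3.html' ];  // made-up URLs

function processNext( url ) {
    // pretend this runs in the callback that fires when the async work is done
    console.log( 'collected: ' + url );
    if ( queue.length ) processNext( queue.shift() );
}

processNext( queue.shift() );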
 

The complete implementation, server.js:

var fs = require( 'fs' );
var http = require( 'http' );
var cheerio = require( 'cheerio' );
var jade = require( 'jade' );

var aList = [];   // article list data, one array per list page
var aUrl = [];    // queue of article URLs still to be collected

// extract id, title, hyperlink and body from an article page
function filterArticle(html) {
    var $ = cheerio.load( html );
    var title = $( "#cb_post_title_url" ).text();
    var href = $( "#cb_post_title_url" ).attr( "href" );
    var re = /\/(\d+)\.html/;
    var id = href.match( re )[1];   // the numeric id in the article URL
    var body = $( "#cnblogs_post_body" ).html();
    return {
        id : id,
        title : title,
        href : href,
        body : body
    };
}

// download one article, render it through the jade template,
// write it to ./html/<id>.html, then move on to the next URL
function crawlerArc( url ){
    var html = '';
    var str = '';
    var arcDetail = {};
    http.get(url, function (res) {
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            arcDetail = filterArticle( html );
            str = jade.renderFile( './views/layout.jade', arcDetail );
            fs.writeFile( './html/' + arcDetail['id'] + '.html', str, function( err ){
                if( err ) {
                    console.log( err );
                }
                console.log( 'success: ' + url );
                if ( aUrl.length ) crawlerArc( aUrl.shift() );
            } );
        });
    });
}

// extract title, url, summary and post time from one list page
function filterHtml(html) {
    var $ = cheerio.load(html);
    var arcList = [];
    var aPost = $("#content").find(".post-list-item");
    aPost.each(function () {
        var ele = $(this);
        var title = ele.find("h2 a").text();
        var url = ele.find("h2 a").attr("href");
        ele.find(".c_b_p_desc a").remove();        // drop the "read more" link
        var entry = ele.find(".c_b_p_desc").text();
        ele.find("small a").remove();
        var listTime = ele.find("small").text();
        var re = /\d{4}-\d{2}-\d{2}\s*\d{2}[:]\d{2}/;
        listTime = listTime.match(re)[0];

        arcList.push({
            title: title,
            url: url,
            entry: entry,
            listTime: listTime
        });
    });
    return arcList;
}

// follow the pager; when there is no next page, start collecting articles
function nextPage( html ){
    var $ = cheerio.load(html);
    var nextUrl = $("#pager a:last-child").attr('href');
    if ( !nextUrl ) return getArcUrl( aList );
    var curPage = parseInt( $("#pager .current").text(), 10 );
    if( !curPage ) curPage = 1;
    // compare page numbers numerically, not as strings
    var nextPage = parseInt( nextUrl.substring( nextUrl.indexOf( '=' ) + 1 ), 10 );
    if ( curPage < nextPage ) crawler( nextUrl );
}

// download one list page and push its article data into aList
function crawler(url) {
    http.get(url, function (res) {
        var html = '';
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            aList.push( filterHtml(html) );
            nextPage( html );
        });
    });
}

// flatten aList (an array of per-page arrays) into the aUrl queue,
// then kick off the article crawler with the first URL
function getArcUrl( arcList ){
    for( var key in arcList ){
        for( var k in arcList[key] ){
            aUrl.push( arcList[key][k]['url'] );
        }
    }
    crawlerArc( aUrl.shift() );
}

var url = 'http://www.cnblogs.com/ghostwu/';
crawler( url );
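To try it yourself, install the two third-party dependencies and run the script (jade and cheerio are the packages required at the top of server.js; make sure views/layout.jade and the html/ output directory exist first):

npm install cheerio jade
node server.js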

The layout.jade file:

doctype html
html
    head
        meta(charset='utf-8')
        title jade+node.js express
        link(rel="stylesheet", href='./css/bower_components/bootstrap/dist/css/bootstrap.min.css')
    body
        block header
            div.container
                div.well.well-lg
                    h3 ghostwu Blogs
                    p js Master's Road
        block container
            div.container
                h3
                    a(href="#{href}") !{title}
                p !{body}
        block footer
            div.container
                footer Copyright - by ghostwu
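Note that the template uses !{title} and !{body} (unescaped interpolation) so that the captured article html is rendered as real markup instead of being printed as text. A quick way to see the difference between #{} (escaped) and !{} (unescaped), using jade.render on inline template strings:

var jade = require( 'jade' );

var locals = { body: '<b>hello</b>' };
console.log( jade.render( 'p #{body}', locals ) ); // <p>&lt;b&gt;hello&lt;/b&gt;</p>
console.log( jade.render( 'p !{body}', locals ) ); // <p><b>hello</b></p>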

Follow-up plans:

1. Store the collected articles in MongoDB

2. Support resuming an interrupted collection (breakpoint collection)

3. Collect images

4. Collect novels

And so on....
