Without quite noticing it, I have been developing with Node.js for more than a year. I have learned a lot in the process and also forgotten a lot, so I want to keep writing things down on this blog for later reference.
This article implements a simple website crawler that grabs the pictures on a web page as well as the page content.
The crawler uses four modules: superagent, cheerio, request and fs.
In fact, superagent and request serve the same purpose: both are HTTP clients used to send requests to a URL. Both are used here simply to record how each one is used. cheerio, as I understand it, is the server-side equivalent of jQuery: it can parse and filter the response content, and its usage is the same as jQuery's. The fs module is the built-in file operation module in Node.js.
Take crawling the 360 website as an example:
/**
 * Simple web crawler.
 *
 * Fetches the home page of `domain` with superagent, finds every img element
 * with cheerio, downloads each image with request, and stores it on disk
 * under a directory named after `domain`.
 */

const sup = require("superagent");
const cheer = require("cheerio");
const request = require("request");
const fs = require("fs");
const path = require("path");

const domain = "www.360.com";
const url = "https://" + domain + "/";

/**
 * Resolve the `src` attribute of an img element against the page URL.
 *
 * Handles absolute (`https://host/a.png`), protocol-relative (`//host/a.png`),
 * root-relative (`/a.png`) and relative (`img/a.png`) sources.
 *
 * @param {string|undefined} src - Raw value of the `src` attribute.
 * @param {string} pageUrl - URL of the page the image was found on.
 * @returns {URL|null} The absolute image URL, or null when `src` is missing,
 *   malformed, or not an http(s) resource (e.g. a `data:` URI).
 */
function resolveImageUrl(src, pageUrl) {
  if (typeof src !== "string" || src.trim() === "") {
    return null;
  }
  let resolved;
  try {
    resolved = new URL(src.trim(), pageUrl);
  } catch (parseError) {
    return null;
  }
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
    return null;
  }
  return resolved;
}

/**
 * Compute the local file path an image should be saved to.
 *
 * Images served from `siteDomain` are stored as `<rootDir>/<url path>`;
 * images from any other host are stored as `<rootDir>/<host>/<url path>`.
 * The query string is not part of the file name, so URLs that differ only
 * in their query map to the same file.
 *
 * @param {URL} imageUrl - Absolute image URL.
 * @param {string} rootDir - Directory all downloads are stored under.
 * @param {string} siteDomain - Host name of the crawled site.
 * @returns {string|null} Path of the target file, or null if the path would
 *   fall outside `rootDir`.
 */
function localPathFor(imageUrl, rootDir, siteDomain) {
  const segments = imageUrl.pathname.split("/").filter(function (segment) {
    return segment !== "";
  });
  if (segments.length === 0) {
    // A URL such as "https://host/" has no file name of its own.
    segments.push("index");
  }
  if (imageUrl.hostname !== siteDomain) {
    segments.unshift(imageUrl.hostname);
  }
  const target = path.join(rootDir, ...segments);
  // Defensive check: never write outside the download directory.
  const resolvedRoot = path.resolve(rootDir);
  if (!path.resolve(target).startsWith(resolvedRoot + path.sep)) {
    return null;
  }
  return target;
}

/**
 * Download one image and write it to disk. Failures are logged and do not
 * interrupt the other downloads.
 *
 * @param {URL} imageUrl - Absolute image URL.
 */
function downloadImage(imageUrl) {
  // `encoding: null` makes request deliver the body as a Buffer instead of
  // decoding it to a string, which would corrupt binary image data.
  request.get({ url: imageUrl.href, encoding: null }, function (error, response, body) {
    if (error) {
      console.error(error);
      return;
    }
    if (response.statusCode < 200 || response.statusCode >= 300) {
      console.error("Skipping " + imageUrl.href + ": HTTP " + response.statusCode);
      return;
    }
    const target = localPathFor(imageUrl, domain, domain);
    if (target === null) {
      console.error("Skipping " + imageUrl.href + ": unsafe local path");
      return;
    }
    try {
      // Creates every missing parent directory; no-op if they already exist.
      fs.mkdirSync(path.dirname(target), { recursive: true });
    } catch (mkdirError) {
      console.error(mkdirError);
      return;
    }
    fs.writeFile(target, body, function (writeError) {
      if (writeError) {
        console.error(writeError);
        return;
      }
      console.log('----------------', target);
    });
  });
}

sup.get(url).end(function (err, res) {
  if (err) {
    console.log(err);
    return;
  }
  console.log('----------------------');
  // Parse the response HTML; cheerio offers a jQuery-like API on the server.
  const $ = cheer.load(res.text);
  // The same image is often referenced several times; download it only once.
  const seen = new Set();
  $("img").each(function (idx, item) {
    const imageUrl = resolveImageUrl(item.attribs.src, url);
    if (imageUrl === null || seen.has(imageUrl.href)) {
      return;
    }
    seen.add(imageUrl.href);
    console.log('********************', imageUrl.href);
    downloadImage(imageUrl);
  });
});
The above is a personal learning record, shared as a small piece of knowledge. If something is wrong, please leave a comment to point it out.