1. Target of this crawl
The goal is to crawl data from a website, then analyze and organize it so that I can fully recreate the web app as a practice project, which means thinking about how the data will be stored in MongoDB. The crawled data is first written to JSON files; of course, it could also be written directly into your own MongoDB, so the cloned site ends up as a complete website. For video playback, you can crawl the video URL and store it in a dedicated field, and the client then plays the video directly from the original site (only the free courses are crawled here).
Running the crawler produces two JSON files with the following structure:
free.json (records the course directions and their classifications)
freeCourses.json (records all the courses; each course carries IDs that correspond to the categories in free.json)
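Roughly, the two files end up with the shapes sketched below; the field values are made-up examples, and the real ones are filled in by the code in sections 3 and 4.

```js
// free.json - one entry per course direction, each with its classifications
// (values are illustrative, not real crawled data)
[
  {
    _id: "5cb...",           // ObjectId generated with mongoose while crawling
    name: "Front End",       // direction name
    tagkey: "fe",            // key parsed from the nav link
    classifications: [
      { _id: "5cb...", name: "Vue.js", tagkey: "vuejs", img: "https://..." }
    ]
  }
]

// freeCourses.json - one entry per course; classify/orientation hold the
// tagkeys of the matching entries in free.json
[
  {
    _id: "5cb...",
    name: "Some course",
    classify: ["vuejs"],
    orientation: ["fe"],
    chapter: [],             // filled in by the detail crawl
    // ...plus cover, hard, member, grade, abstract, teacher, and so on
  }
]
```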
2. Create Project
1) Create a crawler directory and a new index.js file inside it. Run npm init in that directory and press Enter through all the prompts to accept the defaults.
2) Install the npm packages: run npm install mongoose (used here only to generate MongoDB ObjectIds) and npm install cheerio. The https and fs modules are built into Node, so they do not need to be installed. You should end up with the following directory structure.
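After npm init and the installs, the crawler directory should look roughly like this (the two JSON files are generated later by the crawl):

```
crawler/
├── node_modules/
├── index.js
├── package.json
└── package-lock.json
```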
3. Get the course direction and classification data
```js
const mongoose = require('mongoose');
const https = require('https');
const fs = require('fs');
const cheerio = require('cheerio');

var courses = [],
    totalPage = 0,
    orientations = [],   // course directions
    orientationMap = {},
    classifyMap = {},    // course classifications
    baseUrl = 'https://www.imooc.com',
    freeCourseUrl = 'https://www.imooc.com/course/list'; // free courses

function fetchPage(url) {
  getFreeType(url);
}

// Get the course directions
function getFreeType(url) {
  https.get(url, function (res) {
    var html = '';
    res.setEncoding('utf-8'); // prevent garbled Chinese characters
    res.on('data', function (chunk) {
      html += chunk;
    });
    // Listen for the end event and run the callback once the whole page has been received
    res.on('end', function () {
      var $ = cheerio.load(html);
      var orientationUrl = [] // urls of the course direction pages
        , orientationTag = $($('.course-nav-row')[0]);
      orientationTag.find('.course-nav-item').each(function (index, ele) {
        if (index == 0) { return; }
        // course direction record
        let orientationTemp = {
          _id: new mongoose.Types.ObjectId(),
          name: $(ele).find('a').text().trim(),
          tagkey: $(ele).find('a').attr('href').split('=').pop(),
        };
        orientations.push(orientationTemp);
        orientationUrl.push('https://m.imooc.com' + $(ele).find('a').attr('href'));
      });
      getOrientationPage(orientationUrl); // get the classifications under each direction
    });
  });
}

// Get the classifications under each direction
function getOrientationPage(orientationUrl) {
  var promiseList = [];
  orientationUrl.forEach((url, index) => {
    var prom = new Promise(function (resolve, reject) {
      https.get(url, function (res) {
        var html = '';
        res.setEncoding('utf-8');
        res.on('data', function (chunk) {
          html += chunk;
        });
        res.on('end', function () {
          var $ = cheerio.load(html);
          var classifications = [];
          $('.course-item-w a').each((ind, ele) => {
            var classTemp = {
              _id: new mongoose.Types.ObjectId(),
              name: cheerio.load(ele)('.course-label').text().trim(),
              tagkey: $(ele).attr('href').split('=').pop(),
              img: $(ele).find('.course-img').css('background-image').replace('url(', '').replace(')', ''),
            };
            classifications.push(classTemp);
            classifyMap[classTemp.name] = classTemp.tagkey;
            orientationMap[classTemp.name] = orientations[index].tagkey;
          });
          orientations[index].classifications = classifications;
          resolve('done');
        });
      }).on('error', function (err) {
        reject(err);
      });
    });
    promiseList.push(prom);
  });
  Promise.all(promiseList).then(arr => {
    console.log('Type data collection completed');
    getFreeCourses(freeCourseUrl);
    fs.writeFile('./free.json', JSON.stringify(orientations), function (err) {
      if (err) {
        console.log(err);
      } else {
        console.log('free.json file written successfully!');
      }
    });
  });
}
```
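The two lookup maps built here, classifyMap and orientationMap, are what the course-list crawl in the next step uses to turn a course card's label text back into category and direction keys. After this step they hold something like the following (names and keys are made-up examples, not real crawled values):

```js
// Illustrative contents only - the real keys come from the crawled nav links.
classifyMap    = { 'Vue.js': 'vuejs', 'Node.js': 'nodejs' }; // label text -> classification tagkey
orientationMap = { 'Vue.js': 'fe',    'Node.js': 'fe' };     // label text -> parent direction tagkey
```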
4. Get all courses
There are more than 800 free courses and the listing is paginated, so they cannot all be fetched in one go. I started with recursive requests, fetching the next course only after the previous one had been processed. That worked, but it was far too slow and made no use of asynchronous requests at all, so I switched to ES6 and solved it with Promise.all, requesting 10 course detail pages at a time (with many more concurrent requests the Node process cannot keep up and throws all kinds of errors; I have not found a better approach yet, so suggestions are welcome). Once a batch of 10 requests has been processed, the next batch is requested, and so on until everything is done. Processing the 800+ courses this way takes about 3 minutes.
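Stripped of the scraping details, the batching idea looks like this. This is a minimal sketch, not the project's code; fetchDetail stands in for the per-course https.get plus cheerio parsing and is assumed to return a promise.

```js
// Minimal sketch of the batching idea: request `batchSize` detail pages in
// parallel, wait for all of them, then move on to the next batch.
function crawlInBatches(items, fetchDetail, batchSize = 10, start = 0) {
  const batch = items.slice(start, start + batchSize);
  if (batch.length === 0) {
    console.log('All batches done');
    return Promise.resolve();
  }
  return Promise.all(batch.map(fetchDetail))
    .then(() => crawlInBatches(items, fetchDetail, batchSize, start + batchSize));
}
```

The full implementation, which first collects the paginated course list and then crawls the detail pages in batches, follows.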
```js
// Get the courses
function getFreeCourses(url, pageNum) {
  https.get(url, function (res) {
    var html = '';
    if (!pageNum) {
      pageNum = 1;
    }
    res.setEncoding('utf-8'); // prevent garbled Chinese characters
    res.on('data', function (chunk) {
      html += chunk;
    });
    // Listen for the end event and run the callback once the whole page has been received
    res.on('end', function () {
      let $ = cheerio.load(html);
      if (pageNum == 1) {
        let lastPageHref = $(".page a:contains('End Page')").attr('href');
        totalPage = Number(lastPageHref.split('=').pop());
      }
      $('.course-card-container').each(function (index, ele) {
        let $ele = cheerio.load(ele);
        let cover_origin = $ele('img.course-banner').attr('src');
        let cover = cover_origin.split('/').pop();
        let classifyName = [];
        let classify = [];
        let orientation = [];
        let hardAndMember = $ele('.course-card-info span');
        $ele('.course-label label').each((index, item) => {
          var typeTxt = cheerio.load(item).text();
          classifyName.push(typeTxt);
          classify.push(classifyMap[typeTxt]);
          orientation.push(orientationMap[typeTxt]);
        });
        let obj = {
          _id: new mongoose.Types.ObjectId(),
          name: $ele("h3.course-card-name").text(),
          cover: cover,
          cover_origin: cover_origin,
          banner: '',
          banner_origin: '',
          hard: cheerio.load(hardAndMember[0]).text(),            // difficulty
          duration: '',                                           // duration
          member: Number(cheerio.load(hardAndMember[1]).text()),  // number of students
          grade: '',                                              // score
          abstract: $ele('.course-card-desc').text(),             // brief introduction
          chapter: [],
          authorid: '',                                           // author
          imglist: [],                                            // picture details
          detail: '',                                             // details
          orientation: orientation,
          classify: classify,
          classifyName: classifyName,
          detailUrl: baseUrl + $ele('a.course-card').attr('href')
        };
        courses.push(obj);
      });
      if (pageNum == totalPage) {
        console.log('Total number of courses: ' + courses.length);
        pageGetDetail();
      } else {
        let nextPageNum = pageNum + 1;
        let urlParams = freeCourseUrl + '?page=' + nextPageNum;
        getFreeCourses(urlParams, nextPageNum);
      }
    });
  });
}

function pageGetDetail(startPage) {
  if (!startPage) startPage = 0;
  let page = 10;
  let eachArr = courses.slice(startPage * page, (startPage + 1) * page);
  let promiseList = [];
  eachArr.forEach((course, index) => {
    var promise = new Promise(function (resolve, reject) {
      https.get(course.detailUrl, res => {
        let html = '';
        res.setEncoding('utf-8');
        res.on('data', function (chunk) {
          html += chunk;
        });
        res.on('end', function () {
          let $ = cheerio.load(html);
          let chapter = [];
          let chapterEle = $('.course-chapters .chapter');
          let teacherEle = $('.teacher-info');
          var element = courses[startPage * page + index];
          chapterEle.each(function (ind, ele) {
            let $ele = cheerio.load(ele);
            let chapterList = [];
            $ele('.video .J-media-item').each(function (ind, item) {
              let txt = cheerio.load(item).text();
              txt = txt.replace('Start Learning', '');
              txt = txt.trim();
              chapterList.push({
                txt: txt.replace(/[ \r\n]/g, ""),
                type: 'video'
              });
            });
            let tempObj = {
              header: $ele('h3').text().replace(/[ \r\n]/g, ""),
              desc: $ele('.chapter-description').text().replace(/[ \r\n]/g, ""),
              list: chapterList
            };
            chapter.push(tempObj);
          });
          element.duration = $('.static-item')[1] ? cheerio.load($('.static-item')[1])(".meta-value").text() : '';
          element.grade = $('.score-btn .meta-value')[0] ? cheerio.load($('.score-btn .meta-value')[0]).text() : '';
          element.intro = $('.course-description').text().replace(/[ \r\n]/g, "");
          element.notice = $('.course-info-tip .first .autowrap').text();
          element.whatlearn = $('.course-info-tip .autowrap')[1] ? cheerio.load($('.course-info-tip .autowrap')[1]).text() : "";
          element.chapter = chapter;
          element.teacher = {
            name: teacherEle.find('.tit').text().trim(),
            job: teacherEle.find('.job').text(),
            imgSrc: 'http' + $('.teacher-info>a>img').attr('src'),
            img: $('.teacher-info>a>img').attr('src') ? $('.teacher-info>a>img').attr('src').split('/').pop() : ''
          };
          element.teacherUrl = baseUrl + $('.teacher-info>a').attr('href');
          element.questionUrl = baseUrl + $('#qaOn').attr('href');
          element.noteUrl = baseUrl + $('#noteOn').attr('href');
          element.commentUrl = baseUrl + $('#commentOn').attr('href');
          delete element.detailUrl;
          resolve('done');
        });
      });
    });
    promiseList.push(promise);
  });
  Promise.all(promiseList).then(arr => {
    if (startPage * 10 > courses.length) {
      fs.writeFile('./freeCourses.json', JSON.stringify(courses), function (err) {
        if (err) {
          console.log(err);
        } else {
          console.log('freeCourses.json file written successfully!');
        }
      });
    } else {
      let currentPage = startPage + 1;
      pageGetDetail(currentPage);
    }
  });
}

// Kick off the crawl: directions and classifications first, then the course list
getFreeType(freeCourseUrl);
```
5. Running the program
Once the code is written, run node index directly in the crawler directory (I use VS Code, which works great for this). If output like the following is printed, the crawl succeeded. ![](https://s1.51cto.com/images/blog/201904/19/77d77d1388320636e9fe1a4e41b77ff5.png?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)
6. Summary
It feels great to finish this crawl; the data was successfully written to the database to serve as the backend data for the site. The hardest part was shaping the data: even though this was only a practice run, I treated it like a real project, and since the data is meant to be consumed by the front end, it has to be organized carefully. In short, as a coder, continuous learning is a must.
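As a final step, the JSON files can be imported into MongoDB with mongoose. Below is a minimal sketch, assuming a local database named imooc and loosely-typed Orientation/Course models; these names and the connection string are my own assumptions, not part of the original project.

```js
// import.js - minimal sketch for loading the crawled JSON into MongoDB.
// The connection string and model names are assumptions; adjust to your setup.
const mongoose = require('mongoose');
const fs = require('fs');

// strict:false keeps every crawled field without declaring a full schema
const Orientation = mongoose.model('Orientation', new mongoose.Schema({}, { strict: false }));
const Course = mongoose.model('Course', new mongoose.Schema({}, { strict: false }));

async function importData() {
  await mongoose.connect('mongodb://127.0.0.1:27017/imooc');
  const orientations = JSON.parse(fs.readFileSync('./free.json', 'utf-8'));
  const courses = JSON.parse(fs.readFileSync('./freeCourses.json', 'utf-8'));
  await Orientation.insertMany(orientations);
  await Course.insertMany(courses);
  console.log('Imported', orientations.length, 'orientations and', courses.length, 'courses');
  await mongoose.disconnect();
}

importData().catch(err => console.error(err));
```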