One-click export of Java crawler results to Excel

Keywords: Java Apache Excel Gradle


Saving crawl results as an Excel table
WebMagic has no official tutorial on exporting to Excel, so here is my own walkthrough of exporting the crawl results to an Excel file.

Adding the dependencies

I personally prefer Gradle, so I'll show the Gradle configuration first:

//Crawler (WebMagic) packages
compile group: 'us.codecraft', name: 'webmagic-core', version: '0.7.3'
compile group: 'us.codecraft', name: 'webmagic-extension', version: '0.7.3'

//Apache POI, for working with Office files
compile group: 'org.apache.poi', name: 'poi', version: '4.0.1'
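
Note that the compile configuration is deprecated and was removed in Gradle 7; on a recent Gradle the same dependencies would be declared with implementation instead. A sketch, assuming the standard dependencies block:

dependencies {
    //Crawler (WebMagic) packages
    implementation 'us.codecraft:webmagic-core:0.7.3'
    implementation 'us.codecraft:webmagic-extension:0.7.3'

    //Apache POI, for working with Office files
    implementation 'org.apache.poi:poi:4.0.1'
}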

If you use Maven, the equivalent is:

<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>

<!-- Crawler (WebMagic) packages -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

How it works

The class implements WebMagic's Pipeline interface, which receives the extracted results; the saving is done from there.
The save() method is declared synchronized to avoid thread-safety problems when the spider runs with multiple threads.
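
For reference, this is the contract being implemented. In webmagic-core 0.7.x the Pipeline interface is a single method that receives everything put into the page via page.putField():

public interface Pipeline {
    //Called once per processed page, with the extracted result fields
    void process(ResultItems resultItems, Task task);
}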

Implementation code

Here is the full Java code:

import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.FilePersistentBase;


import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;


/**
 * Exports the crawler's results to Excel.
 *
 * extends:
 *  FilePersistentBase   base class for file persistence; provides helpers such as directory creation
 *
 * implements:
 *  PageProcessor   the crawler's page-handling logic
 *  Pipeline        handling of the extracted results
 */
public class WebmagicAndPoiDemo extends FilePersistentBase implements PageProcessor,Pipeline {
    public WebmagicAndPoiDemo(){
        logger = LoggerFactory.getLogger(getClass());
        site = Site.me().setTimeOut(1000).setRetryTimes(3);
        //Set save path
        setPath("G:\\IdeaProjects\\WebMagicDemo\\Temp\\");
        filename = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) +".xls";
        //Create the workbook object
        workbook = new HSSFWorkbook();
        //Create the sheet object (the sheet name is set here)
        sheet = workbook.createSheet("Crawling results");

        //Create rows for sheet
        HSSFRow row = sheet.createRow(rows);
        row.createCell(0).setCellValue("id");
        row.createCell(1).setCellValue("Name");
        row.createCell(2).setCellValue("Link address");
        rows++;

    }

    private String filename;
    private int rows = 0;
    private HSSFWorkbook workbook;
    private HSSFSheet sheet;
    private Site site;
    private Logger logger;


    @Override
    public Site getSite() {
        return site;
    }


    /** Pipeline method: handle the extracted results **/
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<String> hrefs = resultItems.get("href");
        List<String> texts = resultItems.get("text");
        logger.debug(hrefs.toString());
        logger.debug(texts.toString());

        for (int i = 0; i < hrefs.size(); i++) {
            //Create rows for sheet
            HSSFRow row = sheet.createRow(rows);
            row.createCell(0).setCellValue(rows);
            row.createCell(2).setCellValue(hrefs.get(i));
            row.createCell(1).setCellValue(texts.get(i));
            rows++;
        }
        save();
    }
    /** Save the workbook to disk **/
    private synchronized void save() {
        //try-with-resources closes the stream even if write() fails
        try (FileOutputStream out = new FileOutputStream(getFile(this.path).getPath() + "\\" + filename)) {
            workbook.write(out);
            logger.info(this.path + "\\" + filename + " saved successfully");
        } catch (IOException e) {
            logger.warn("Failed to save the file", e);
        }
    }


    /** PageProcessor method: extract data from the page **/
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();
        Selectable href = html.$(".postTitle2","href");
        Selectable text = html.$(".postTitle2","text");
        page.putField("href",href.all());
        page.putField("text",text.all());
    }


    public static void main(String[] args) {
        //Crawl my own blog's post list
        WebmagicAndPoiDemo app = new WebmagicAndPoiDemo();
        Spider.create(app).addPipeline(app)
        .addUrl("https://www.cnblogs.com/xiaoshuai123/").thread(1).run();
    }
}
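
If you'd rather produce a modern .xlsx file, the same POI calls work with XSSFWorkbook, which lives in the extra poi-ooxml artifact. A minimal standalone sketch; the file name results.xlsx is just a placeholder of mine:

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;

public class XlsxExportSketch {
    public static void main(String[] args) throws IOException {
        //Workbook is Closeable in POI 4.x, so try-with-resources works here too
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet("Crawling results");
            //Header row, same layout as in the crawler above
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("id");
            header.createCell(1).setCellValue("Name");
            header.createCell(2).setCellValue("Link address");
            //Write the workbook out
            try (FileOutputStream out = new FileOutputStream("results.xlsx")) {
                workbook.write(out);
            }
        }
    }
}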
