Java crawler: one-click crawling and saving the results to Excel
Saving crawl results as an Excel spreadsheet
WebMagic has no official tutorial for exporting to Excel, so here is my own write-up on how to do it.
Importing the dependencies
I personally prefer Gradle, so I'll show the Gradle configuration first:
//Crawler packages
compile group: 'us.codecraft', name: 'webmagic-core', version: '0.7.3'
compile group: 'us.codecraft', name: 'webmagic-extension', version: '0.7.3'
//POI package for Office file manipulation
compile group: 'org.apache.poi', name: 'poi', version: '4.0.1'
If you use Maven, the equivalent is:
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>
<!-- Crawler packages -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
How it works
Here I implement WebMagic's Pipeline interface and do the saving inside it: every result the crawler extracts is handed to the pipeline, which writes it into the Excel workbook.
The save() method is declared synchronized to avoid thread-safety problems when the spider runs with multiple threads.
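For orientation, the Pipeline contract in webmagic-core is a single method, process(ResultItems, Task). A minimal sketch of a pipeline that only prints the extracted fields (the class name LoggingPipeline is my own, not from the original post) could look like this:

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class LoggingPipeline implements Pipeline {
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Every field stored via page.putField() in the PageProcessor arrives here.
        resultItems.getAll().forEach((key, value) ->
                System.out.println(key + " -> " + value));
    }
}

The Excel exporter below follows the same pattern, except that its process() method appends rows to a workbook instead of printing.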
Implementation code
Here is the full Java code:
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.FilePersistentBase;

import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

/**
 * Export crawler results to Excel.
 *
 * extends:
 *   FilePersistentBase - base class for file persistence; provides helpers such as directory creation
 *
 * implements:
 *   PageProcessor - the crawler's page-parsing logic
 *   Pipeline      - handles the extracted results
 */
public class WebmagicAndPoiDemo extends FilePersistentBase implements PageProcessor, Pipeline {

    private String filename;
    private int rows = 0;
    private HSSFWorkbook workbook;
    private HSSFSheet sheet;
    private Site site;
    private Logger logger;

    public WebmagicAndPoiDemo() {
        logger = LoggerFactory.getLogger(getClass());
        site = Site.me().setTimeOut(1000).setRetryTimes(3);
        //Set the save path
        setPath("G:\\IdeaProjects\\WebMagicDemo\\Temp\\");
        filename = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".xls";

        //Create the workbook object
        workbook = new HSSFWorkbook();
        //Create the sheet object (the sheet name can be set here)
        sheet = workbook.createSheet("Crawling results");

        //Create the header row
        HSSFRow row = sheet.createRow(rows);
        row.createCell(0).setCellValue("id");
        row.createCell(1).setCellValue("Name");
        row.createCell(2).setCellValue("Link address");
        rows++;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** This is the Pipeline method. **/
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<String> hrefs = resultItems.get("href");
        List<String> texts = resultItems.get("text");
        logger.debug(hrefs.toString());
        logger.debug(texts.toString());

        for (int i = 0; i < hrefs.size(); i++) {
            //One row per extracted link
            HSSFRow row = sheet.createRow(rows);
            row.createCell(0).setCellValue(rows);
            row.createCell(1).setCellValue(texts.get(i));
            row.createCell(2).setCellValue(hrefs.get(i));
            rows++;
        }
        save();
    }

    /** Save the workbook; synchronized so concurrent spider threads cannot corrupt it. **/
    private synchronized void save() {
        try (FileOutputStream out =
                     new FileOutputStream(getFile(this.path).getPath() + "\\" + filename)) {
            workbook.write(out);
            logger.info(this.path + "\\" + filename + " saved");
        } catch (IOException e) {
            logger.warn("Saving failed", e);
        }
    }

    /** This is the PageProcessor method. **/
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();
        Selectable href = html.$(".postTitle2", "href");
        Selectable text = html.$(".postTitle2", "text");
        page.putField("href", href.all());
        page.putField("text", text.all());
    }

    public static void main(String[] args) {
        //Crawl my own blog's post list
        WebmagicAndPoiDemo app = new WebmagicAndPoiDemo();
        Spider.create(app)
                .addPipeline(app)
                .addUrl("https://www.cnblogs.com/xiaoshuai123/")
                .thread(1)
                .run();
    }
}
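One possible extension, not covered in the original post: the code above writes the legacy .xls format via HSSFWorkbook. If you want the newer .xlsx format, POI's poi-ooxml artifact (an extra dependency, e.g. org.apache.poi:poi-ooxml:4.0.1) provides XSSFWorkbook with a nearly identical API. A minimal standalone sketch, with the output file name results.xlsx chosen as a placeholder:

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;

public class XlsxExportSketch {
    public static void main(String[] args) throws IOException {
        //Requires org.apache.poi:poi-ooxml in addition to org.apache.poi:poi
        try (Workbook workbook = new XSSFWorkbook();
             FileOutputStream out = new FileOutputStream("results.xlsx")) {
            Sheet sheet = workbook.createSheet("Crawling results");
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("id");
            header.createCell(1).setCellValue("Name");
            header.createCell(2).setCellValue("Link address");
            workbook.write(out);
        }
    }
}

For very large crawls, the same artifact also offers SXSSFWorkbook, a streaming variant that keeps memory usage bounded by flushing rows to disk as they are written.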