Using Selenium in Java to crawl a web page's source code and send it by email

Keywords: Selenium Spring Session Java

1. Project setup: IntelliJ IDEA + Gradle + Spring Boot
  build.gradle: declare the Selenium-related dependencies

// Build script for the boom-selenium demo: a Spring Boot web application that
// crawls a page with Selenium, renders it to PDF and sends it by email.
apply plugin: 'java'
apply plugin: 'maven'
apply plugin: 'idea'
apply plugin: 'org.springframework.boot'

sourceCompatibility = 1.8
group = 'com.boom.basement'

// Assign the project property directly. Writing `def version = ...` would only
// declare a script-local variable and leave the project's version unset.
version = '1.0.0.RELEASE'
// Script-local value; nothing else in this script reads it.
def artifactId = 'boom-selenium'

buildscript {
    ext {
        // Shared by the plugin classpath below and the starter dependencies further down.
        springBootVersion = '1.5.12.RELEASE'
    }
    repositories {
        // Fetch over HTTPS so build plugins cannot be tampered with in transit.
        maven { url 'https://maven.aliyun.com/nexus/content/groups/public/' }
    }
    dependencies {
        classpath("org.springframework.boot:spring-boot-gradle-plugin:${springBootVersion}")
        // NOTE(review): this is the Maven build plugin; it is presumably not needed by a Gradle build - confirm before removing.
        classpath("org.springframework.boot:spring-boot-maven-plugin:${springBootVersion}")
    }
}

repositories {
    // Fetch over HTTPS so dependencies cannot be tampered with in transit.
    maven { url 'https://maven.aliyun.com/nexus/content/groups/public/' }
}

dependencies {
    // Spring Boot: core, web MVC and Thymeleaf templating
    compile "org.springframework.boot:spring-boot-starter:$springBootVersion"
    compile "org.springframework.boot:spring-boot-starter-web:$springBootVersion"
    compile "org.springframework.boot:spring-boot-starter-thymeleaf:$springBootVersion"
    // Mail delivery: plain javax.mail is used instead of spring-boot-starter-mail
    compile group: 'javax.mail', name: 'mail', version: '1.4.7'
    // Selenium: selenium-api and selenium-remote-driver are pinned explicitly and
    // excluded from selenium-chrome-driver so all three resolve to 3.141.59
    compile group: 'org.seleniumhq.selenium', name: 'selenium-api', version: '3.141.59'
    compile group: 'org.seleniumhq.selenium', name: 'selenium-remote-driver', version: '3.141.59'
    compile('org.seleniumhq.selenium:selenium-chrome-driver:3.141.59') {
        exclude module: 'selenium-api'
        exclude module: 'selenium-remote-driver'
    }
    // PDF rendering
    compile 'com.itextpdf:itextpdf:5.4.2'
    compile 'org.xhtmlrenderer:flying-saucer-pdf:9.0.8'
}

  Notes:
   ① The selenium-api and selenium-remote-driver versions that selenium-chrome-driver pulls in transitively do not match 3.141.59, so exclude them and declare the correct versions explicitly.
   ② Use javax.mail directly. The spring-boot-starter-mail integration provided by Spring Boot may cause connection timeouts to the mail server (the specific reason is unknown).
   ③ Use version 9.0.8 of flying-saucer-pdf.
  main startup class:

/**
 * Entry point of the demo; hands control to Spring Boot, which starts the application.
 */
@SpringBootApplication
public class BoomSeleniumApplication {

    /**
     * Launches the application.
     *
     * @param args command-line arguments, forwarded unchanged to Spring Boot
     */
    public static void main(final String... args) {
        final Class<?> bootClass = BoomSeleniumApplication.class;
        SpringApplication.run(bootClass, args);
    }
}

2. Write the controller: crawl the page source, convert it to PDF, and send it by email

import com.lowagie.text.DocumentException;
import com.sun.mail.util.MailSSLSocketFactory;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
import org.xhtmlrenderer.pdf.ITextRenderer;

import javax.activation.DataHandler;
import javax.activation.FileDataSource;
import javax.mail.*;
import javax.mail.internet.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.security.GeneralSecurityException;
import java.util.Properties;

/**
 * Demo controller that serves a test page and, on request, crawls that page with
 * headless Chrome, renders its source to a PDF file and emails both the HTML
 * (as the message body) and the PDF (as an attachment).
 */
@Controller
@RequestMapping("/selenium")
public class SeleniumController {

    /** Location of the ChromeDriver binary; it must be downloaded and placed here in advance. */
    private static final String CHROME_DRIVER_PATH = "d:\\chromedriver.exe";
    /** Address of the page whose rendered source is crawled. */
    private static final String TARGET_URL = "http://localhost:8080/selenium/index";
    /** File the crawled source is rendered to before it is attached to the mail. */
    private static final String PDF_PATH = "d:\\index.pdf";
    /** SMTP server used for sending. */
    private static final String SMTP_HOST = "smtp.qq.com";
    /** Sender mailbox (placeholder). */
    private static final String SENDER = "xxxxxx@qq.com";
    /** Authorization code of the sender mailbox (placeholder; do not commit a real value). */
    private static final String AUTH_CODE = "xxxxxx";
    /** Target mailbox (placeholder). */
    private static final String RECIPIENT = "xxxxx@qq.com";

    /**
     * Serves the test page.
     *
     * @return the view name {@code "index"}
     */
    @RequestMapping("/index")
    public String index() {
        return "index";
    }

    /**
     * Crawls the test page, converts its source to PDF and sends both by email.
     *
     * <p>{@code @ResponseBody} marks the request as fully handled. Without it, a
     * {@code void} handler makes Spring MVC look for a view named after the request
     * path, and the request fails after the mail has already been sent.
     *
     * @throws GeneralSecurityException if the SSL socket factory cannot be created
     * @throws MessagingException       if building or sending the message fails
     * @throws IOException              if the PDF file cannot be written or the attachment name cannot be encoded
     * @throws DocumentException        if the PDF cannot be produced
     */
    @RequestMapping("/sendMail")
    @ResponseBody
    public void sendMail() throws GeneralSecurityException, MessagingException, IOException, DocumentException {
        String pageSource = fetchPageSource();

        // Close the file even when rendering fails, so the attachment step never
        // sees a half-written or still-locked file.
        try (OutputStream pdfOut = new FileOutputStream(PDF_PATH)) {
            createPDF(pdfOut, pageSource);
        }

        Session session = Session.getInstance(buildMailProperties());
        Message msg = buildMessage(session, pageSource);

        Transport transport = session.getTransport();
        try {
            // Connect mailbox: specify mailbox and authorization code
            transport.connect(SMTP_HOST, SENDER, AUTH_CODE);
            transport.sendMessage(msg, new Address[]{new InternetAddress(RECIPIENT)});
        } finally {
            // Always release the SMTP connection, also when connect/send throws.
            transport.close();
        }
    }

    /**
     * Opens the target page in headless Chrome and returns its source.
     * The browser is always shut down afterwards so no Chrome process is left behind.
     */
    private static String fetchPageSource() {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);

        ChromeOptions chromeOptions = new ChromeOptions();
        // Run without a visible browser window
        chromeOptions.addArguments("--headless");
        // Set browser window size
        chromeOptions.addArguments("--window-size=1920,1080");

        WebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.get(TARGET_URL);
            return driver.getPageSource();
        } finally {
            driver.quit();
        }
    }

    /** Builds the JavaMail session properties for an authenticated SMTP-over-SSL connection. */
    private static Properties buildMailProperties() throws GeneralSecurityException {
        Properties props = new Properties();
        // Print the mail session's debug output
        props.setProperty("mail.debug", "true");
        // The sending server requires authentication
        props.setProperty("mail.smtp.auth", "true");
        props.setProperty("mail.host", SMTP_HOST);
        props.setProperty("mail.transport.protocol", "smtp");

        MailSSLSocketFactory sf = new MailSSLSocketFactory();
        // NOTE(review): trusting all hosts disables the server identity check of the
        // TLS connection; kept as in the original, but consider removing it.
        sf.setTrustAllHosts(true);
        props.put("mail.smtp.ssl.enable", "true");
        props.put("mail.smtp.ssl.socketFactory", sf);
        return props;
    }

    /**
     * Builds the multipart message: the crawled HTML as body plus the PDF file as attachment.
     * An inline image could be added as a further {@link MimeBodyPart} with a content ID.
     */
    private static Message buildMessage(Session session, String pageSource)
            throws MessagingException, IOException {
        Message msg = new MimeMessage(session);
        msg.setSubject("JavaMail Test");

        // Text part: the crawled source is the body of the message
        MimeBodyPart text = new MimeBodyPart();
        text.setContent(pageSource, "text/html;charset=UTF-8");

        // Attachment part: the PDF file the source was converted to
        MimeBodyPart attachment = new MimeBodyPart();
        DataHandler pdfHandler = new DataHandler(new FileDataSource(PDF_PATH));
        attachment.setDataHandler(pdfHandler);
        // The attachment file name has to be encoded
        attachment.setFileName(MimeUtility.encodeText(pdfHandler.getName()));

        MimeMultipart multipart = new MimeMultipart();
        multipart.addBodyPart(text);
        multipart.addBodyPart(attachment);
        msg.setContent(multipart);

        msg.setFrom(new InternetAddress(SENDER));
        return msg;
    }

    /**
     * Converts HTML to PDF.
     *
     * <p>For Chinese text, a suitable font presumably has to be registered through
     * the renderer's font resolver before {@code layout()} is called.
     *
     * @param out  stream the PDF is written to
     * @param html markup to render
     * @throws IOException       declared for callers; raised if writing fails
     * @throws DocumentException if the PDF document cannot be created
     */
    public static void createPDF(OutputStream out, String html) throws IOException, DocumentException {
        ITextRenderer renderer = new ITextRenderer();
        renderer.setDocumentFromString(html);
        renderer.layout();
        renderer.createPDF(out);
    }
}

  Note: you need to download chromedriver.exe in advance from the official ChromeDriver download page (choose the build that matches your installed Chrome version).
3. index.html: note that it must be placed in the resources/templates/ folder, because Spring Boot uses that location as Thymeleaf's default template directory; otherwise you need to configure the relevant Thymeleaf properties manually.

<!DOCTYPE html>
<html lang="en" xmlns:th="http://www.thymeleaf.org">
    <head>
        <!-- Test page returned by the /selenium/index handler (view name "index").
             Its rendered source is crawled, used as the email body and converted to PDF. -->
        <title>Title</title>
    </head>
    <body>
        <a href="https://www.baidu.com">Hello</a>
    </body>
</html>

  Note: avoid tags such as <link> and <meta> in the HTML where possible, because they interfere with the PDF conversion. Also, when the HTML is used as the body of an email, mail providers do not load externally linked resources for security reasons. For details, see: HTML mail compatibility issues

309 original articles published, 47 praised, 70000 visited+
Private letter follow

Posted by John_wilson on Thu, 16 Jan 2020 10:09:47 -0800