A simple Java crawler for 51Jobs

Keywords: Java encoding JQuery

This crawler uses Jsoup, an HTML parser that can parse a URL or an HTML file directly. It also lets you extract and manipulate data using DOM traversal, CSS selectors, and jQuery-like methods.

Official document address of Jsoup: https://jsoup.org/cookbook/introduction/parsing-a-document

Note: if the text comes back garbled, check which encoding the web page uses and decode the response with that encoding. When submitting Chinese text through a form, some websites also require the value to be URL-encoded before it is sent.

The main code is as follows:

package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal crawler for a job-search site: submits a keyword through the
 * search form found on the index page, caches the returned HTML on disk and
 * writes a plain-text summary of the listed positions.
 */
public class Cat5jobs {

    /**
     * Fetches the index page at {@code url}, reads the {@code action} of its
     * search form and POSTs {@code keyword} to it.
     *
     * @param url     address of the index page containing the search form
     * @param keyword search keyword; it is URL-encoded as GBK before sending
     * @return the parsed result page, or {@code null} when either request
     *         fails or the index page has no form action to post to
     * @throws UnsupportedEncodingException if the GBK charset is unavailable
     */
    public Document getResultPage(String url,String keyword) throws UnsupportedEncodingException {

        // The keyword is pre-encoded as GBK so that Chinese text is not garbled in the POST.
        // NOTE(review): Jsoup form-encodes data() values itself as well; confirm the
        // site really expects the pre-encoded value rather than postDataCharset("gbk").
        String encodedKeyword = URLEncoder.encode(keyword, "gbk");

        try {
            // Load the index page; its cookies are reused for the search request.
            Response resp = Jsoup.connect(url).method(Method.GET).execute();
            Document indexPage = resp.parse();

            // The search form's action is the address of the query result page.
            String actionPath = indexPage.select("form").attr("action");
            if (actionPath.isEmpty()) {
                System.err.println("No form action found on index page: " + url);
                return null;
            }

            Connection con = Jsoup.connect(actionPath)
                    .data("keyword", encodedKeyword)
                    .userAgent("Mozilla")
                    .cookies(resp.cookies())
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(300000);

            return con.method(Method.POST).execute().parse();

        } catch (IOException e) {
            e.printStackTrace();
            // Never hand back the index page as if it were the result page.
            return null;
        }
    }

    /**
     * Produces the text summary for {@code keyword}. The result page is read
     * from {@code dir + fileName + ".htm"} when that file exists; otherwise
     * it is downloaded and cached there. The summary is written to
     * {@code dir + fileName + "2.txt"}.
     *
     * @param url      address of the index page containing the search form
     * @param keyword  search keyword
     * @param dir      output directory prefix; it is concatenated directly
     *                 with {@code fileName}, so it must end with a separator
     * @param fileName base name (without extension) of the output files
     */
    public void getResult(String url,String keyword,String dir,String fileName) {

        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";

        Document doc = loadCachedPage(new File(htmlFilePath));
        if (doc == null) {
            System.out.println("file not found");
            doc = downloadAndCache(url, keyword, htmlFilePath);
        }
        if (doc == null) {
            // Nothing to analyse: both the cache and the download failed.
            System.err.println("No result page available for keyword: " + keyword);
            return;
        }

        Map<String,Object> screen = screen51Jobs(doc);

        try {
            saveScreen(screen, txtFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses the cached HTML file, or returns {@code null} if it is missing or unreadable. */
    private Document loadCachedPage(File cache) {
        if (!cache.isFile()) {
            return null;
        }
        try {
            Document doc = Jsoup.parse(cache, "utf-8");
            if (!doc.children().isEmpty()) {
                System.out.println("File not empty");
            }
            return doc;
        } catch (IOException ignored) {
            // An unreadable cache is treated like a missing one; the caller re-downloads.
            return null;
        }
    }

    /** Downloads the result page and stores it at {@code htmlFilePath}; returns {@code null} on download failure. */
    private Document downloadAndCache(String url, String keyword, String htmlFilePath) {
        Document doc = null;
        try {
            doc = this.getResultPage(url, keyword);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (doc == null) {
            return null;
        }
        try {
            saveFile(doc.toString(), htmlFilePath);
        } catch (IOException e) {
            // Caching is best-effort; the downloaded page is still usable.
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Formats the extracted data as text, echoes it to standard output and
     * writes it to {@code path}.
     *
     * @param screen map produced by {@link #screen51Jobs(Document)}
     * @param path   destination file for the text summary
     * @throws IOException if the summary cannot be written
     */
    private void saveScreen(Map<?,?> screen,String  path) throws IOException {

        StringBuilder sb = new StringBuilder();
        String p = "\r\n";
        sb.append(p + " KeyWord:" + screen.get("keyword") + p + p +" Total query data:"
                    + screen.get("totalquerydata") + p + p + " Recruitment info:");

        Object listing = screen.get("recruitmentlist");
        if (listing instanceof List) {
            for (Object item : (List<?>) listing) {
                if (!(item instanceof Map)) {
                    continue;
                }
                for (Map.Entry<?,?> entry : ((Map<?,?>) item).entrySet()) {
                    sb.append(p).append(entry.getKey()).append(" == ").append(entry.getValue());
                }
                sb.append(p);
            }
        }

        String summary = sb.toString();
        System.out.println(summary);
        saveFile(summary, path);
    }

    /**
     * Extracts the searched keyword, the total-results text and one entry per
     * listed position from the result page.
     *
     * @param doc parsed query result page
     * @return map with keys {@code "keyword"}, {@code "totalquerydata"} and
     *         {@code "recruitmentlist"} (a list of per-position maps)
     */
    private Map<String,Object> screen51Jobs(Document doc){

        Map<String,Object> screen = new HashMap<String,Object>();

        Elements resultList = doc.select("div[class=dw_table]div[id=resultList]");
        Elements findKeyword = resultList.select("div[class=sbox]");
        Elements totalQueryData = resultList.select("div[class=rt]div:matchesOwn(^common)");
        Elements recruitmentInfo = resultList.select("div[class=el]");

        screen.put("keyword", findKeyword.text());
        screen.put("totalquerydata", totalQueryData.text());

        List<Map<String,String>> recruitmentList = new ArrayList<Map<String,String>>();
        for (Element e : recruitmentInfo) {
            Map<String,String> m = new HashMap<String,String>();
            m.put("position",e.select("p[class~=^t1]").text());
            m.put("href", e.select("a").attr("href"));
            m.put("corporatename", e.select("a").text());
            m.put("address", e.select("span[class=t3]").text());
            m.put("salary", e.select("span[class=t4]").text());
            m.put("releasedate", e.select("span[class=t5]").text());
            recruitmentList.add(m);
        }
        screen.put("recruitmentlist", recruitmentList);

        return screen;
    }

    /**
     * Writes {@code src} to {@code path} as UTF-8, creating or truncating the file.
     *
     * @param src  text to write
     * @param path destination file
     * @throws IOException if the file cannot be opened or written
     */
    private void saveFile(String src,String path) throws IOException {

        byte[] bytes = src.getBytes("utf-8");

        // Closing the stream flushes the buffer; without it, small payloads never reach the disk.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(path))) {
            out.write(bytes, 0, bytes.length);
        }
    }
}

Posted by RootKit on Thu, 02 Jan 2020 08:54:52 -0800