Catalog
Compared with Python, writing a crawler in Java is a bit more complex. A few lines of Python code can fetch a web page, while Java may need dozens of lines or even more, so Python has the advantage in terms of code volume. But it is by no means impossible in Java. Since we are learning the basic syntax of Java, writing a crawler is a good way to consolidate that basic knowledge and master the related concepts.
The java.net package of Java also provides a high-level network programming class - URL, through which Internet resources can be accessed.
1, Use URL class
Java's java.net.URL class is used to request resources on the Internet over the HTTP/HTTPS protocol. The request method is GET, which is generally used to request small amounts of static server-side data.
Common construction methods of URL class:
- URL(String spec): creates a URL object based on the string representation.
- URL(String protocol, String host, String file): creates a URL object based on the specified protocol name, host name, and file name.
- URL(String protocol, String host, int port, String file): creates a URL object based on the specified protocol name, host name, port number, and file name.
Common methods of URL class:
- InputStream openStream(): opens a connection to this URL and returns an input stream.
- URLConnection openConnection(): opens a new connection to this URL and returns a URLConnection object.
Here is how to use java.net.URL class to do simple web page grabbing. The example code is as follows:
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Java web crawler: fetches a single page through {@link URL#openStream()} and
 * prints its text to standard output.
 *
 * @author : Cai Zheng Jie
 * @email :caizhengjie888@icloud.com
 * @date : 2020/2/21
 * @time : 11:04 Afternoon
 */
public class HelloWorld {

    /**
     * Downloads the hard-coded page line by line, decoding it as UTF-8, and prints it.
     * Failures are reported with a stack trace; nothing is rethrown.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Web site to fetch
        String url = "https://www.sina.com.cn/";

        URL reqURL;
        try {
            reqURL = new URL(url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            // Without a valid URL there is nothing to open; stop here instead of
            // falling through and dereferencing a null URL.
            return;
        }

        try (
            // Open the network input stream; all three resources are closed automatically
            InputStream inputStream = reqURL.openStream();
            InputStreamReader inputStreamReader =
                    new InputStreamReader(inputStream, StandardCharsets.UTF_8);
            BufferedReader bufferedReader = new BufferedReader(inputStreamReader)
        ) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // readLine() strips the line terminator, so put one back
                sb.append(line).append('\n');
            }
            // Log output
            System.out.println(sb);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2, Use HttpURLConnection to send GET request
Because the URL class can only send HTTP/HTTPS GET requests, if you want to send other kinds of requests or have finer control over network requests, you can use the HttpURLConnection class.
The code is as follows:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Sends an HTTP GET request with {@link HttpURLConnection} and prints the response body.
 *
 * @author : Cai Zheng Jie
 * @email :caizhengjie888@icloud.com
 * @date : 2020/2/21
 * @time : 11:29 Afternoon
 */
public class HelloWorld {

    /** Address the GET request is sent to. */
    static String urlString = "https://blog.csdn.net/weixin_45366499";

    /**
     * Opens a connection to {@link #urlString}, reads the response line by line
     * (decoded as UTF-8) and prints it. Failures are reported with a stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpURLConnection conn = null;
        try {
            URL reqURL = new URL(urlString);
            // openConnection() returns a URLConnection; for http/https URLs it is an HttpURLConnection
            conn = (HttpURLConnection) reqURL.openConnection();
            // Request method set to GET
            conn.setRequestMethod("GET");

            // try-with-resources closes the streams (innermost last-declared first)
            // before the connection is disconnected in the finally block, and also
            // closes the raw stream if wrapping it fails part-way.
            try (InputStream is = conn.getInputStream();
                 InputStreamReader isr = new InputStreamReader(is, StandardCharsets.UTF_8);
                 BufferedReader br = new BufferedReader(isr)) {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    // readLine() strips the line terminator, so put one back
                    sb.append(line).append('\n');
                }
                // Output log
                System.out.println(sb);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                // Disconnect
                conn.disconnect();
            }
        }
    }
}
3, Use HttpURLConnection to send POST request
HttpURLConnection can also send HTTP/HTTPS POST requests. The following describes how to use HttpURLConnection to send POST requests.
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Sends an HTTP POST request with {@link HttpURLConnection} and prints the response body.
 *
 * @author : Cai Zheng Jie
 * @email :caizhengjie888@icloud.com
 * @date : 2020/2/22
 * @time : 6:45 Afternoon
 */
public class HelloWorld {

    /** Web service address the POST request is sent to. */
    static String urlstring = "https://blog.csdn.net/weixin_45366499";

    /**
     * Posts the request parameters to {@link #urlstring}, then reads the response
     * line by line (decoded as UTF-8) and prints it. Failures are reported with a
     * stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpURLConnection conn = null;
        try {
            URL reqURL = new URL(urlstring);
            // openConnection() returns a URLConnection; for http/https URLs it is an HttpURLConnection
            conn = (HttpURLConnection) reqURL.openConnection();
            // HTTP request method is POST
            conn.setRequestMethod("POST");
            // Writing a request body requires doOutput; doInput is already true by default.
            // Without this call getOutputStream() throws a ProtocolException.
            conn.setDoOutput(true);

            // Request parameters sent as the POST body
            String parm = " ";
            try (OutputStream out = conn.getOutputStream()) {
                // Encode explicitly: DataOutputStream.writeBytes would drop the high
                // byte of every char and corrupt non-ASCII parameters.
                out.write(parm.getBytes(StandardCharsets.UTF_8));
            }

            // Open the network input stream; the readers are closed automatically
            try (InputStream is = conn.getInputStream();
                 InputStreamReader isr = new InputStreamReader(is, StandardCharsets.UTF_8);
                 BufferedReader br = new BufferedReader(isr)) {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    // readLine() strips the line terminator, so put one back
                    sb.append(line).append('\n');
                }
                // Log output
                System.out.println(sb);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}
4, Download case
To get more familiar with the URL class, here is a download example program, Downloader, which downloads a picture from the Internet.
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads a picture from the Internet and saves it as {@code ./download.jpg}.
 *
 * @author : Cai Zheng Jie
 * @email :caizhengjie888@icloud.com
 * @date : 2020/2/22
 * @time : 7:55 Afternoon
 */
public class Downloader {

    /** Address of the picture to download. */
    static String urlString = "http://a3.att.hudong.com/68/61/300000839764127060614318218_950.jpg";

    /**
     * Program entry point; runs the download once.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        download();
    }

    /**
     * Copies the bytes served at {@link #urlString} into {@code ./download.jpg}.
     * "Download complete!" is printed only when the whole copy succeeded; any
     * failure is reported with a stack trace instead.
     */
    private static void download() {
        HttpURLConnection conn = null;
        try {
            // Create URL object
            URL requrl = new URL(urlString);
            // Open the connection
            conn = (HttpURLConnection) requrl.openConnection();

            // No catch on this inner try: an I/O failure must skip the success
            // message below and be handled by the outer catch.
            try (
                // Get the input stream from the connection object
                InputStream is = conn.getInputStream();
                BufferedInputStream bufferedInputStream = new BufferedInputStream(is);
                // Create a file output stream
                OutputStream os = new FileOutputStream("./download.jpg");
                BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(os)
            ) {
                byte[] buffer = new byte[1024];
                int bytesRead;
                while ((bytesRead = bufferedInputStream.read(buffer)) != -1) {
                    // Write only the bytes actually read in this pass
                    bufferedOutputStream.write(buffer, 0, bytesRead);
                }
            }
            System.out.println("Download complete!");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}
The result is to download a picture from the Internet to your own computer. If you are interested, you can try it.
The above content is for reference only. If there is infringement, please contact me to delete it!
If this article is helpful to you, a thumbs-up (the button in the lower left corner) is the biggest encouragement to the blogger.
Your encouragement is the blogger's greatest motivation!