I. Technologies used:
- lxml: parse web pages
- requests: fetch web page content
- re: strip characters that are illegal in filenames
- os: handle filenames and extensions
- queue: thread-safe queues for communication between threads
- urllib: download the scraped images
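Before the multithreaded demo in Section III, the following minimal single-threaded sketch shows how these pieces fit together. It is an illustration under stated assumptions: the URL and the data-original/alt attributes simply mirror the demo, and the regular expression is one common choice for stripping characters Windows forbids in filenames.

import os
import re
from urllib import request

import requests
from lxml import etree

# Illustrative sketch: URL and attribute names mirror the demo below.
url = 'http://www.doutula.com/photo/list/?page=1'
html = etree.HTML(requests.get(url).text)
for img in html.xpath('//img'):
    src = img.get('data-original')           # lazy-loaded image URL
    alt = img.get('alt') or 'unnamed'
    if not src:
        continue
    safe = re.sub(r'[\\/:*?"<>|]', '', alt)  # characters Windows forbids in filenames
    request.urlretrieve(src, safe + os.path.splitext(src)[1])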
II. Design idea:
The producer-consumer model is used to design the multithreading here. The producers are responsible for parsing the web pages and collecting the URLs of all the images on each page, while the consumers are responsible for downloading those images to disk, which is the I/O-bound work. Five producers and five consumers are created.
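For readers new to the model, here is a minimal, self-contained sketch of producer-consumer threading with queue.Queue. It is an illustration only; the sentinel-based shutdown shown here is a common alternative to the empty()-polling used in the demo below.

import threading
from queue import Queue

SENTINEL = object()  # unique object signalling consumers to stop

def producer(q):
    for item in range(10):       # stand-in for real work, e.g. parsing pages
        q.put(item)

def consumer(q):
    while True:
        item = q.get()
        if item is SENTINEL:     # shutdown signal received
            break
        print('consumed', item)  # stand-in for real work, e.g. downloading

q = Queue()
producers = [threading.Thread(target=producer, args=(q,)) for _ in range(2)]
consumers = [threading.Thread(target=consumer, args=(q,)) for _ in range(2)]
for t in producers + consumers:
    t.start()
for t in producers:
    t.join()                     # wait until all items are queued
for _ in consumers:
    q.put(SENTINEL)              # one sentinel per consumer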
III. Demo:
import os
import re
import threading
from queue import Queue
from urllib import request

import requests
from lxml import etree

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/75.0.3770.100 Mobile Safari/537.36'
}


class Producer(threading.Thread):
    """Parses list pages and puts (image URL, filename) pairs onto img_queue."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pq = page_queue
        self.iq = img_queue

    def run(self):
        while True:
            if self.pq.empty():
                break
            url = self.pq.get()
            self.parse_page(url)

    def get_html(self, url):
        r = requests.get(url, headers=HEADERS)
        r.encoding = r.apparent_encoding
        return r.text

    def parse_page(self, url):
        html = etree.HTML(self.get_html(url))
        imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        imgurls = []
        alts = []
        for img in imgs:
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Each image URL appears twice in the page, so skip ones already seen.
            if img_url not in imgurls:
                imgurls.append(img_url)
            if alt not in alts:
                alts.append(alt)
        for imgurl, alt in zip(imgurls, alts):
            # Strip characters that are illegal or awkward in Windows filenames.
            alt1 = re.sub(r'[?？,，\.。!！*]', '', alt)
            suffix = os.path.splitext(imgurl)[1]
            filename = alt1 + suffix
            self.iq.put((imgurl, filename))


class Consumer(threading.Thread):
    """Takes (image URL, filename) pairs off img_queue and downloads them."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.pq = page_queue
        self.iq = img_queue

    def run(self):
        while True:
            # Stop once the producers are done and no downloads remain.
            if self.pq.empty() and self.iq.empty():
                break
            imgurl, filename = self.iq.get()
            request.urlretrieve(imgurl, os.path.join('images', filename))
            print(filename + ' download completed')


if __name__ == '__main__':
    os.makedirs('images', exist_ok=True)  # urlretrieve needs the target directory to exist
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for i in range(1, 50):
        page_queue.put('http://www.doutula.com/photo/list/?page=' + str(i))
    for x in range(5):
        Producer(page_queue, img_queue).start()
    for x in range(5):
        Consumer(page_queue, img_queue).start()
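A design note on the demo: both thread classes stop by polling Queue.empty(). This is simple and usually works here because the producers fill img_queue much faster than the consumers drain it, but empty() checks can race: a consumer may pass the check and then block in iq.get() after another consumer takes the last item. The sentinel-based shutdown sketched in Section II is the more robust pattern.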
Compared with a single-threaded crawler, the speed-up is clearly observable.
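To quantify the speed-up on your own machine, you can time the run. The sketch below is an assumption-laden variant of the demo's main block: it reuses page_queue and img_queue from above and only keeps references to the threads so they can be joined, letting the elapsed time cover all downloads.

import time

start = time.perf_counter()
threads = ([Producer(page_queue, img_queue) for _ in range(5)]
           + [Consumer(page_queue, img_queue) for _ in range(5)])
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait until every thread has exited
print('elapsed: %.1f s' % (time.perf_counter() - start))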