Multithreaded Python crawler for downloading meme (sticker) image packs

I. Technologies used:

  • Lxml: parsing web pages

  • Requests Library: get web page information

  • re: replace illegal characters

  • os: process filename

  • queue.Queue: thread-safe hand-off of work between threads

  • urllib: download the collected pictures

II. Design idea:

The multithreading is designed around the producer-consumer model. A producer is responsible for parsing a web page and collecting the URLs of all the pictures on it, while a consumer is responsible for the I/O work of downloading those pictures to the local disk. The demo runs five producers and five consumers.

III. Demo:

import requests
from lxml import etree
import os
import re
from urllib import request
from queue import Queue
import threading
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36'
class Producers(threading.Thread):
    """Worker thread that turns listing-page URLs into image download jobs.

    A producer repeatedly takes a page URL from the page queue, fetches and
    parses that page, and puts one ``(image_url, filename)`` tuple per image
    onto the image queue.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        Args:
            page_queue: queue of page URLs still to be crawled.
            img_queue: queue that receives ``(image_url, filename)`` tuples.
            *args: forwarded to ``threading.Thread``.
            **kwargs: forwarded to ``threading.Thread``.
        """
        super(Producers, self).__init__(*args, **kwargs)
        # Keep the queues in separate attributes; a chained assignment
        # (``self.pq = page_queue = img_queue``) would alias both to img_queue.
        self.pq = page_queue
        self.iq = img_queue

    def run(self):
        """Drain the page queue, parsing each page; return once it is empty."""
        from queue import Empty  # the file itself only imports Queue

        while True:
            # get_nowait() instead of empty()+get(): with several producers,
            # another thread may take the last URL between the two calls and
            # leave this one blocked forever.
            try:
                url = self.pq.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # One unreachable page should not end the whole worker.
                print('Failed to fetch ' + url + ': ' + str(exc))

    def getHtml(self, url):
        """Return the decoded HTML text of *url*.

        Uses the module-level ``HEADRES`` request headers. A timeout is set so
        that a stalled connection cannot block the thread indefinitely.
        """
        r = requests.get(url, headers=HEADRES, timeout=10)
        r.encoding = r.apparent_encoding
        return r.text

    def parse_page(self, url):
        """Queue a ``(image_url, filename)`` job for every picture on *url*."""
        text = self.getHtml(url)
        html = etree.HTML(text)
        imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        # Each image URL appears twice in the crawled markup, so remember the
        # ones already queued. De-duplicating on the URL alone keeps every URL
        # paired with its own alt text.
        seen = set()
        for img in imgs:
            imgurl = img.get('data-original')
            if not imgurl or imgurl in seen:
                continue
            seen.add(imgurl)
            filename = self._make_filename(imgurl, img.get('alt'))
            self.iq.put((imgurl, filename))

    @staticmethod
    def _make_filename(imgurl, alt):
        """Build a local file name from the image's alt text and URL suffix.

        Punctuation, whitespace and the characters Windows forbids in file
        names are stripped from *alt*. If nothing usable remains, the base
        name taken from the URL is used instead.
        """
        stem = re.sub(r'[\\/:*?"<>|.,!\s]', '', alt or '')
        base, suffix = os.path.splitext(os.path.basename(imgurl))
        return (stem or base) + suffix
class Customer(threading.Thread):
    """Worker thread that downloads queued images into the ``images/`` folder.

    A consumer repeatedly takes an ``(image_url, filename)`` tuple from the
    image queue and saves that image as ``images/<filename>``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        Args:
            page_queue: queue of page URLs; only checked for emptiness here.
            img_queue: queue of ``(image_url, filename)`` tuples to download.
            *args: forwarded to ``threading.Thread``.
            **kwargs: forwarded to ``threading.Thread``.
        """
        super(Customer, self).__init__(*args, **kwargs)
        # Keep the queues in separate attributes; a chained assignment
        # (``self.pq = page_queue = img_queue``) would alias both to img_queue.
        self.pq = page_queue
        self.iq = img_queue

    def run(self):
        """Download queued images until both queues are empty."""
        from queue import Empty  # the file itself only imports Queue

        while True:
            # NOTE(review): both queues can be momentarily empty while a
            # producer is still parsing its last page, so a consumer may exit
            # slightly early; confirm whether that is acceptable.
            if self.iq.empty() and self.pq.empty():
                break
            # A bounded wait instead of a bare get(): another consumer may
            # take the last job after the check above, and an unbounded get()
            # would then block this thread forever.
            try:
                imgurl, filename = self.iq.get(timeout=1)
            except Empty:
                continue
            request.urlretrieve(imgurl, 'images/' + filename)
            print(filename+'Download completed')
if __name__ == '__main__':
    # Customer saves every file under 'images/'; create it up front so the
    # first download does not fail on a missing directory.
    os.makedirs('images', exist_ok=True)

    page_queue = Queue(100)
    img_queue = Queue(1000)

    # NOTE(review): the listing URL prefix is an empty string in this file
    # (presumably stripped when the article was published); fill in the
    # site's paginated listing URL before running.
    for i in range(1, 50):
        url = '' + str(i)
        page_queue.put(url)

    # Five producers parse listing pages...
    for x in range(5):
        t = Producers(page_queue, img_queue)
        t.start()

    # ...and five consumers download the images they discover.
    for x in range(5):
        t = Customer(page_queue, img_queue)
        t.start()

Compared with single-threaded crawling, the speed-up is clearly noticeable.

