A Python multithreaded crawler suits I/O-bound tasks — work dominated by network or disk I/O. Multithreading can significantly improve efficiency for such workloads (e.g., multithreaded crawling or multithreaded file processing), but CPU-bound tasks are not a good fit for it.
Idea: put all the URLs into a queue; worker threads pull URLs from the queue whenever they are not blocked on I/O, and exit once every URL has been processed.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# env: python3.X
"""Multithreaded crawler: collects land-listing URLs from a fang.com index
page into a queue, then spawns worker threads that drain the queue and
scrape each detail page."""
import threading, queue, time, requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the index page and enqueue every listing's detail-page URL.
res = requests.get('http://land.fang.com/market/210100________1_0_1.html')
soup = BeautifulSoup(res.text, 'html.parser')

urlQueue = queue.Queue()
for message in soup.select('.list28_text'):
    # BUG FIX: `select('a')` returns a list, so the original
    # `message.select('a')['href']` raised TypeError. Take the first <a>
    # via select_one and guard against entries with no link.
    link = message.select_one('a')
    if link is not None and link.has_attr('href'):
        urlQueue.put('http://land.fang.com' + link['href'])


def fetchUrl(urlQueue):
    """Worker loop: pop URLs off *urlQueue* until it is empty.

    For each URL that opens with HTTP 200, re-fetch the page with
    requests, parse the `.banbox tr td` cells, and print cells 1-2.
    Fetch failures are skipped (best-effort crawling).
    """
    while True:
        try:
            url = urlQueue.get_nowait()  # non-blocking read of queue data
        except queue.Empty:  # precise exception: queue drained, worker exits
            break
        # print('Current Thread Name %s, Url: %s ' % (threading.currentThread().name, url))
        try:
            response = urlopen(url)
            responseCode = response.getcode()  # returned status code
        except Exception:
            continue  # best-effort: skip any URL that fails to open
        if responseCode == 200:
            # Content-processing for the scraped page goes here.
            detail = requests.get(url)
            soup1 = BeautifulSoup(detail.text, 'html.parser')
            # BUG FIX: `messes = ` had lost its list literal; rebuilt as a
            # comprehension over the table cells.
            messes = [mess.text for mess in soup1.select('.banbox tr td')]
            print(messes[1:3])
            # time.sleep(1)


if __name__ == '__main__':
    start = time.time()
    threads = []  # BUG FIX: restored the lost `[]` literal
    threadNum = 10
    for i in range(threadNum):
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
        t.start()
    for t in threads:  # wait for every worker to finish before timing
        t.join()
    end = time.time()
    print('the total time is: %s ' % (end - start))