Python 3 multithreaded crawler

Keywords: Python network

A Python multithreaded crawler suits I/O-intensive tasks, i.e. work dominated by network and disk I/O. For such tasks multithreading can significantly improve throughput, for example a multithreaded crawler or multithreaded file processing. CPU-intensive tasks do not benefit, because CPython's GIL only lets one thread execute Python bytecode at a time.
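As a rough illustration of why threads pay off for I/O-bound work, here is a minimal sketch (not from the original post; the URLs, worker count, and timeout are placeholder assumptions) that fetches the same pages sequentially and then with a thread pool:

# Minimal sketch: sequential vs. threaded downloads for I/O-bound work.
# The URL list is a placeholder; any slow-ish pages will do.
import time
import requests
from concurrent.futures import ThreadPoolExecutor

urls = ['http://example.com'] * 5   # placeholder URLs

def fetch(url):
    return requests.get(url, timeout=10).status_code

start = time.time()
for u in urls:
    fetch(u)                         # one request at a time
print('sequential: %.2fs' % (time.time() - start))

start = time.time()
with ThreadPoolExecutor(max_workers=5) as pool:
    list(pool.map(fetch, urls))      # requests overlap while each thread waits on I/O
print('threaded:   %.2fs' % (time.time() - start))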

Idea: put all the URLs into a queue, then let several threads read URLs from the queue with non-blocking gets and fetch them. When the queue is empty, all URLs have been processed and each thread simply exits.

#!/usr/bin/env python
#-*- coding:utf-8 -*-
#env:python3.X

import threading, queue, time, requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the listing page and put every detail-page URL into a shared queue
res = requests.get('http://land.fang.com/market/210100________1_0_1.html')
soup = BeautifulSoup(res.text, 'html.parser')
urlQueue = queue.Queue()
for message in soup.select('.list28_text'):
    url = 'http://land.fang.com' + message.select('a')[0]['href']
    urlQueue.put(url)

def fetchUrl(urlQueue):
    while True:
        try:
            url = urlQueue.get_nowait()   # non-blocking read from the queue
        except queue.Empty:
            break                         # queue is empty, so this thread exits
        #print('Current Thread Name %s, Url: %s' % (threading.current_thread().name, url))
        try:
            response = urlopen(url)
            responseCode = response.getcode()   # HTTP status code of the response
        except Exception:
            continue                      # skip URLs that fail to open
        if responseCode == 200:
            # Process the page here: parse the detail page and collect
            # the text of its table cells
            detail = requests.get(url)
            soup1 = BeautifulSoup(detail.text, 'html.parser')
            messes = []
            for mess in soup1.select('.banbox tr td'):
                messes.append(mess.text)
            print(messes[1:3])
            #time.sleep(1)
if __name__ == '__main__':
    start = time.time()
    threads = []
    threadNum = 10
    # Start threadNum worker threads, each pulling URLs from the shared queue
    for i in range(threadNum):
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
        t.start()
    # Wait for all workers to finish draining the queue
    for t in threads:
        t.join()
    end = time.time()
    print('the total time is: %s' % (end - start))
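For comparison, the same crawl can be written more compactly with the standard-library thread pool. The following is an alternative sketch, not part of the original post; it assumes the same page structure (the .list28_text and .banbox selectors) and simply replaces the hand-rolled queue and threads with concurrent.futures.ThreadPoolExecutor:

#!/usr/bin/env python
# Alternative sketch: the same crawl with concurrent.futures.ThreadPoolExecutor.
# Assumes the same site structure as the script above.
import time, requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

listing = requests.get('http://land.fang.com/market/210100________1_0_1.html')
soup = BeautifulSoup(listing.text, 'html.parser')
urls = ['http://land.fang.com' + m.select('a')[0]['href']
        for m in soup.select('.list28_text')]

def fetchOne(url):
    # Fetch one detail page and return the second and third table cells
    detail = requests.get(url)
    if detail.status_code != 200:
        return None
    soup1 = BeautifulSoup(detail.text, 'html.parser')
    return [td.text for td in soup1.select('.banbox tr td')][1:3]

if __name__ == '__main__':
    start = time.time()
    with ThreadPoolExecutor(max_workers=10) as pool:
        for row in pool.map(fetchOne, urls):
            if row:
                print(row)
    print('the total time is: %s' % (time.time() - start))

Here pool.map() distributes the URLs over the workers and yields results in input order, so the explicit queue, the thread list, and the join loop are no longer needed.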
