structure
A producer thread generates the page URLs and puts them on a shared queue.
Multiple consumer threads pull URLs from that queue and process them concurrently.
# Producer/consumer demo: one producer thread queues page URLs and several
# consumer threads drain the queue concurrently.
from queue import Queue
import threading
import time

try:
    import requests
except ImportError:
    # The download call in consume() is still commented out, so the demo
    # must keep working when the third-party package is not installed.
    requests = None

url_base = 'http://www.qiushibaike.com/8hr/page/{}/'
header = {}


def load_data():
    """Return the list of page URLs to crawl (pages 1, 3, 6 and 7)."""
    return [url_base.format(i) for i in [1, 3, 6, 7]]


# Producer
def produce(q):
    """Put every URL from load_data() on *q*, then a ``None`` end marker.

    The end marker tells consumers that no more work is coming. Without it
    the producer would have to spin forever and consumers would block on an
    empty queue indefinitely.

    Args:
        q: queue.Queue shared with the consumers. ``put`` blocks while a
            bounded queue is full, so consumers must be running (or the
            queue large enough) for this call to return.
    """
    for url in load_data():
        q.put(url)
    q.put(None)


# Consumer
def consume(q):
    """Take URLs from *q* and report each one until the end marker is seen.

    On receiving the ``None`` end marker the consumer puts it back so that
    sibling consumers sharing the queue also stop, then returns. The marker
    is therefore still on the queue after all consumers have finished.

    Args:
        q: queue.Queue filled by produce().
    """
    while True:
        download_url = q.get()
        if download_url is None:
            # The marker is the last item queued, so the queue is empty
            # here and this put cannot block even on a bounded queue.
            q.put(None)
            return
        # requests.get(download_url, headers=header)
        print('thread is {} content is {}'.format(threading.current_thread(), download_url))


def main():
    """Run one producer and two consumers and wait for all of them to finish."""
    q = Queue(4)
    workers = [
        threading.Thread(target=produce, args=(q,)),
        threading.Thread(target=consume, args=(q,)),
        threading.Thread(target=consume, args=(q,)),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
class
The crawler class must inherit from threading.Thread.
Its __init__ method must call the parent class's __init__ (super().__init__()).
After creating an instance, calling start() runs the class's run() method in a new thread.
# class ConsumeSpider(threading.Thread):
#     def __init__(self):
#         super().__init__()
#         pass
#
#     def run(self):
#         pass
#
# c3 = ConsumeSpider()
# c3.start()
Coroutines
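Coroutine: a lightweight alternative to a thread. Switching between coroutines happens inside a single thread, so there is no thread context switch, and one thread can interleave several tasks by handing control back and forth between them. In Python this can be implemented with yield.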
# Generator-based "coroutine" demo: two tasks take turns inside one thread.
import threading
import time


def _announce_forever(label, delay):
    """Yield control after printing *label* and sleeping *delay* seconds, forever."""
    while True:
        print(label, threading.current_thread())
        time.sleep(delay)
        # Hand control back to whoever called next(); resume here next time.
        yield


def task_1(delay=1):
    """Return a generator that prints the task-1 banner once per step.

    Args:
        delay: seconds to sleep after each print (default 1, as before).
    """
    return _announce_forever('-----1-----', delay)


def task_2(delay=1):
    """Return a generator that prints the task-2 banner once per step.

    Args:
        delay: seconds to sleep after each print (default 1, as before).
    """
    return _announce_forever('-----2-----', delay)


def main(rounds=None, delay=1):
    """Alternate between the two tasks by stepping their generators in turn.

    Args:
        rounds: number of task_1/task_2 rounds to run; ``None`` (the default)
            keeps alternating forever, matching the original demo.
        delay: seconds each task sleeps per step.
    """
    t1 = task_1(delay)
    t2 = task_2(delay)
    completed = 0
    while rounds is None or completed < rounds:
        next(t1)
        next(t2)
        completed += 1


if __name__ == '__main__':
    main()
Please use the mobile phone to "scan" x