I've been teaching myself Python for a little over two months, and by chance I got hold of 14 videos from the Old Boy automated-development course, which I treasure like gems. I like Alex's pep talks and Wu Sir's seriousness. I haven't finished watching them all yet, but I've already learned a lot.
Today I want to put what I've learned to use and crawl all of the latest movie information from Movie Paradise (ygdy8.net). The modules involved:
requests: fetches the raw page content
re: extracts the exact pieces of information you want from the page
BeautifulSoup: makes it easy to search tags and pull out the information you want
threading: multithreading dramatically cuts the total crawl time
queue: a thread-safe queue used for writing the file (I actually put all the data into a database)
Those are the modules; a minimal sketch of how they fit together is just below.
Suggestions are welcome. (And if Alex or Wu Sir happens to see this, I'd love to hear their advice.)
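Before the full script, here's a minimal sketch of how these modules cooperate; the URL, the tag lookup, and the 《...》 pattern are placeholders for illustration, not the real site structure:

import queue
import re
import threading

import requests
from bs4 import BeautifulSoup

q = queue.Queue()  # thread-safe queue that collects results for the writer


def crawl(page_url):
    resp = requests.get(page_url)  # requests: fetch the raw page
    soup = BeautifulSoup(resp.text, 'html.parser')  # BeautifulSoup: walk the tags
    for a in soup.find_all('a'):  # every link on the page
        m = re.search(r'《(.*?)》', a.get_text())  # re: pull out just the piece you want
        if m:
            q.put(m.group(1))  # queue: hand the result off for writing


# threading: one thread per page overlaps the waiting on network I/O
t = threading.Thread(target=crawl, args=('http://example.com/',))
t.start()
t.join()
while not q.empty():
    print(q.get())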
# Author : 'n1celll'
import requests
import json
import re
from bs4 import BeautifulSoup
import threading
import queue, time

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'


def get_page(url):
    index = requests.get(url, headers=header)
    index.encoding = 'GBK'  # match the site's own encoding
    t = index.text
    index_soup = BeautifulSoup(t, 'html.parser')  # turn the fetched page into a soup object
    # the last <option> of the page selector holds the total number of pages
    all_pages = index_soup.find('select', attrs={'name': 'sldd'}).find_all('option')[-1]
    page = int(all_pages.string)
    return page


def get_data(page):
    # fetch the listing data for one page
    page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html' % page
    print(page)
    # page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_30.html'
    res = requests.get(page_url, headers=header)
    res.encoding = 'GBK'  # 'gb2312'
    a = res.text
    soup = BeautifulSoup(a, 'html.parser')
    name = soup.find_all('a', attrs={'class': 'ulink'})
    # print(name)  # test
    for i in name:
        try:
            # two pitfalls here: some titles are not wrapped in 《》, and some entries carry two <a> tags
            moive_name = re.search('《(.*?)(》|】)', i.string).group()
        except:
            continue
        html = 'http://www.ygdy8.net' + i['href']
        da = requests.get(html, headers=header)
        da.encoding = 'GBK'  # da.apparent_encoding
        db = da.text
        # f = open('test2.txt', 'w', encoding='utf8')
        # f.write(a.text)
        # f.close()
        dr = BeautifulSoup(db, 'html.parser')
        span = dr.find('span', attrs={'style': 'FONT-SIZE: 12px'})
        if span:
            dc = span.text.split()
            data = ''
            for part in dc:  # rejoin the metadata block with all whitespace stripped
                data += part
            print(data)
            msg = {}
            if data:
                msg['mname'] = moive_name
                # plain groups instead of look-behinds: re's look-behind must be fixed-width,
                # while the field labels on the detail pages vary in length
                try:
                    show_t = re.search(r'(?:◎年代|◎时间|品年代|年代】|播时间|播:)(.*?)(?=◎|年|【)', data).group(1)
                except:
                    show_t = re.search(r'(?:日期|部:)(.*?)(?=-|剧)', data).group(1)
                msg['myear'] = show_t  # its own key, so the film length below doesn't overwrite it
                try:
                    country = re.search(r'(?:◎国家|◎产地|◎地区|◎国别|国家】)(.*?)(?=◎|【类)', data).group(1)
                except:
                    try:
                        country = re.search(r'(?<=地区)(.*?)(?=语言)', data).group()
                    except:
                        country = 'Unknown'
                msg['mcountry'] = country
                try:
                    mlength = re.search(r'(?:◎片长|片长】)(.*?)(?=◎|【)', data).group(1)
                except:
                    mlength = 'Unknown'
                msg['mtime'] = mlength
                try:
                    mtype = re.search(
                        r'(?:◎类别|别类型|影类型|◎类型|集类型|◎分类|类型:|类别】|片类型|类型：)(.*?)(?=◎|级|【制作|【主演)',
                        data).group(1)
                except:
                    try:
                        mtype = re.search(r'类型:(.*?)(?=国家)', data).group(1)
                    except:
                        mtype = re.search(r'动作|爱情|战争', data).group()
                # The regexes above still feel clumsy to me; I'd welcome technical advice
                # (one alternative is sketched after the script).
                # with open('test4.txt', 'a+', encoding='utf8') as f:  # test
                #     f.write('%s: %s,%s,%s,%s\n' % (moive_name, country, mtype, mlength, show_t))  # test
                q.put('%s: %s,%s,%s,%s,%s\n' % (moive_name, country, mtype, mlength, show_t, html))


q = queue.Queue(maxsize=10000)
t_obj = []
lock = threading.Lock()  # a lock so only one writer touches the file at a time
# semaphore = threading.BoundedSemaphore(200)


def writing(f):
    # semaphore.acquire()
    data = q.get()  # each call writes exactly one queued line
    lock.acquire()
    f.write(data)
    lock.release()
    # semaphore.release()
    # if not q.get():
    #     f.close()
    #     break
    # print('write complete')


all_page = get_page(url)
f = open('test4.txt', 'w', encoding='utf8')
print(all_page + 1)

for i in range(1, all_page + 1):
    t = threading.Thread(target=get_data, args=(i,))
    t.start()
    t_obj.append(t)

for t in t_obj:
    t.join()  # make sure every crawl thread has finished before any writing starts
    print('%s over' % t)

while q.qsize():  # while lines remain in the queue, spawn one writer thread per line
    w = threading.Thread(target=writing, args=(f,))
    w.start()
    w.join()
else:
    print('Writing complete')

f.close()
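Since I asked for advice on the regexes, here is one alternative I'd consider, just a sketch rather than the code I ran: split the stripped metadata string on the ◎ marker and build a label-to-value dict, then look each field up by name. The LABELS tuple is my assumption about which field names appear on the detail pages.

# assumed field labels; extend this tuple for whatever the detail pages actually use
LABELS = ('译名', '片名', '年代', '产地', '类别', '语言', '上映日期', '片长', '导演', '主演')


def parse_fields(data):
    # data looks like '◎年代2019◎产地美国◎类别动作◎片长120分钟' after whitespace stripping
    fields = {}
    for chunk in data.split('◎'):
        for label in LABELS:
            if chunk.startswith(label):
                fields[label] = chunk[len(label):]
                break
    return fields


info = parse_fields('◎年代2019◎产地美国◎类别动作◎片长120分钟')
show_t = info.get('年代', 'Unknown')
country = info.get('产地', 'Unknown')
mtype = info.get('类别', 'Unknown')

The same spirit would also simplify the writing stage: instead of spawning one short-lived thread per queued line, a single writer thread that loops on q.get() until it sees a sentinel (say, None) could drain the queue on its own, and the lock would no longer be needed.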