I recently learned web scraping and practiced on Douban Movie. Douban has an anti-crawling mechanism, though: after crawling around 250 pages it redirects you and asks you to log in, so for this test I only crawl the first 50 films. Without further ado, let's look at the source code.
This time we use the requests library, the Beautiful Soup parsing library, and the re module for the regular-expression matching that assists it. Finally, as usual, we use a pandas DataFrame to write the results to an Excel file.
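The core idea is small enough to isolate first: pull each film's detail-page URL out of the list-page HTML with a regular expression compiled with `re.S`, which lets `.` match across line breaks. Here is a minimal, self-contained sketch; the sample HTML string is invented for illustration, not copied from Douban:

```python
import re

# Invented stand-in for one entry on a Top 250 list page.
sample = '''
<div class="item">
  <div class="pic">
    <a href="https://movie.douban.com/subject/1292052/">poster</a>
  </div>
</div>
'''

# re.S makes '.' match newlines, so one lazy pattern can span a whole entry.
pattern = re.compile('<div.*?class="item">.*?<div.*?class="pic">.*?<a.*?href="(.*?)">', re.S)
print(re.findall(pattern, sample))
# ['https://movie.douban.com/subject/1292052/']
```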
```python
import requests
from bs4 import BeautifulSoup
import re
import pandas

headers = {
    'Host': 'movie.douban.com',
    'Origin': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36',
}

base_url = 'https://movie.douban.com/top250?start={}&filter='

# Quick one-off check that the list page responds and the pattern matches.
response = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
if response.status_code == 200:
    # print(response.text)
    pass
# re.S lets '.' match line breaks, so one pattern can match each movie entry on the page.
pattern1 = re.compile('<div.*?class="item">.*?<div.*?class="pic">.*?<a.*?href="(.*?)">', re.S)
urls = re.findall(pattern1, response.text)

directors = []   # directors
names = []       # movie titles
stars = []       # starring actors
countrys = []    # countries/regions of production
languages = []   # film languages

headers_urls = {
    'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# Sample markup from a detail page:
# <span property="v:itemreviewed">The Shawshank Redemption</span>
# <a href="/celebrity/1047973/" rel="v:directedBy">Frank Darabont</a>
# <a href="/celebrity/1054521/" rel="v:starring">Tim Robbins</a>

def base_urls(base_url):
    urls = []
    # Only the first two pages are tested here, so the range stops at 50.
    # for i in range(0, 275, 25):
    #     true_url = base_url.format(i)
    #     print(true_url)
    for i in range(0, 50, 25):
        true_url = base_url.format(i)
        print(true_url)
        response = requests.get(true_url, headers=headers)
        if response.status_code == 200:
            pattern1 = re.compile('<div.*?class="item">.*?<div.*?class="pic">.*?<a.*?href="(.*?)">', re.S)
            url = re.findall(pattern1, response.text)
            # findall returns a list; appending it directly would nest lists,
            # so extract the elements one by one and append them.
            for i in url:
                urls.append(i)
    return urls
```
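The detail-page parser below leans on one BeautifulSoup trick the original comments flag as a learning point: find a `<span>` by its text, then read the bare text node that follows it through `next_sibling`. A minimal sketch on an invented snippet (an English label is used here for readability; the live page uses Chinese labels):

```python
from bs4 import BeautifulSoup

# Invented snippet mirroring the detail-page markup.
snippet = '<span class="pl">Language:</span> English / Spanish<br/>'
soup = BeautifulSoup(snippet, 'lxml')

span = soup.find('span', text='Language:')  # match the span by its text
print(repr(span.next_sibling))              # ' English / Spanish' (text node after the span)
print(span.next_sibling[1:])                # slice off the leading space
```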
```python
def parse_url(urls):
    # Only the first two pages (50 films) were tested, so the range stops at 50.
    for i in range(0, 50, 1):
        res = requests.get(urls[i], headers=headers_urls)
        print(res)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')

            # Movie title
            name = soup.find('span', property="v:itemreviewed")
            names.append(name.text)

            # Director
            director = soup.find('a', rel="v:directedBy")
            directors.append(director.text)

            # Starring actors (one list per film)
            star_save = []
            for star in soup.find_all('a', rel="v:starring"):
                star_save.append(star.text)
            stars.append(star_save)

            # Country/region of production
            # <span class="pl">制片国家/地区:</span> United States<br/>
            # Learning point: locate the next sibling node by matching the span's text.
            country = soup.find('span', text='制片国家/地区:').next_sibling[1:]
            countrys.append(country)

            # Film language
            # <span class="pl">语言:</span>
            language = soup.find('span', text='语言:').next_sibling[1:]
            languages.append(language)

if __name__ == '__main__':
    base = base_urls(base_url)
    print(base)
    print(len(base))
    parse_url(base)
    print(countrys)
    print(directors)
    print(languages)
    print(names)

    # Finally, write the data into an Excel sheet.
    # (stars is collected above but not exported, since each entry is a list.)
    info = {'Filmname': names, 'Directors': directors, 'Country': countrys, 'Languages': languages}
    pdfile = pandas.DataFrame(info)
    # pdlook.to_excel('Lianjia.xlsx', sheet_name="Lianjia second-hand housing, Guangzhou")  # leftover from an earlier project
    pdfile.to_excel('DoubanFilm.xlsx', sheet_name="Douban movies")
```
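One refinement worth considering, given the login redirect mentioned at the top: pace the requests and detect the redirect instead of silently following it. This is only a sketch of the idea, not part of the original script; the one-second delay and the use of allow_redirects=False are my assumptions:

```python
import time
import requests

def polite_get(url, headers, delay=1.0):
    """Fetch a page after a short pause, and surface Douban's login redirect."""
    time.sleep(delay)  # assumed pause; tune to taste
    # allow_redirects=False keeps the 3xx response visible instead of
    # following it straight through to the login page.
    res = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
    if res.status_code in (301, 302):
        print('Redirected (probably the login wall):', res.headers.get('Location'))
        return None
    return res
```

Swapping the bare requests.get calls in base_urls and parse_url for polite_get should let a full 250-film run go further before hitting the login wall.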