120 lines of code to crawl the Douban Movie Top 250


I recently learned web scraping and practiced on Douban Movie. Douban has an anti-crawling mechanism, though: after roughly 250 requests it redirects you and asks you to log in, so this time I only crawl the first 50 entries for testing. Enough talk; let's look at the source code.
This time we use the requests library, Beautiful Soup for parsing, and the re module for regular-expression matching. Finally, as usual, we write the results to an Excel file with a pandas DataFrame.
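Before the full script, it helps to know when Douban has walled you off. Here is a minimal sketch, assuming the anti-crawl mechanism works by redirecting to a login page (the exact login URL and the request threshold are my guesses, not something I verified):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36'}
r = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
# requests follows redirects by default and keeps the intermediate responses
# in r.history, so a non-empty history plus a login-looking final URL is a
# hint that the anti-crawl wall kicked in. The 'login' substring check is an
# assumption about what Douban's login URL contains.
if r.history and 'login' in r.url:
    print('Redirected to login page:', r.url)
else:
    print('Fetched directly, status', r.status_code)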

import requests
from bs4 import BeautifulSoup
import re
import pandas

headers = {
    'Host':'movie.douban.com',
    'Origin':'https://movie.douban.com',
    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36',
}
base_url = 'https://movie.douban.com/top250?start={}&filter='


# Quick smoke test: fetch the first page once to confirm the headers get us past Douban.
response = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
if response.status_code == 200:
    # print(response.text)
    pass

# re.S makes '.' match newlines too, so the pattern can span line breaks and
# match each movie's detail-page link on the list page.
pattern1 = re.compile('<div.*?class="item">.*?<div.*?class="pic">.*?<a.*?href="(.*?)">', re.S)
urls = re.findall(pattern1, response.text)
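# For reference, the pattern above targets markup shaped roughly like this
# (structure assumed from the Top 250 list page; the subject id is just an
# illustrative example):
#   <div class="item">
#       <div class="pic">
#           <a href="https://movie.douban.com/subject/1292052/">
# so each match is a full detail-page URL.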

directors = []  # directors
names = []      # movie names
stars = []      # leading actors
countrys = []   # countries/regions of production
languages = []  # movie languages

headers_urls = {
    'Host':'movie.douban.com',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# <span property="v:itemreviewed">The Shawshank Redemption</span>
# <a href="/celebrity/1047973/" rel="v:directedBy">Frank Darabont</a>
# <a href="/celebrity/1054521/" rel="v:starring">Tim Robbins</a>
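# Douban embeds RDFa-style attributes (property="v:...", rel="v:...") in its
# detail pages, and BeautifulSoup can match any such attribute as a keyword
# argument, e.g. soup.find('span', property='v:itemreviewed') or
# soup.find_all('a', rel='v:starring').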
def base_urls(base_url):
    urls = []
    # We only test on the first two pages here, so the range stops at 50.
    # To crawl all ten pages, use:
    # for i in range(0, 250, 25):
    #     true_url = base_url.format(i)
    for i in range(0, 50, 25):
        true_url = base_url.format(i)
        print(true_url)

        response = requests.get(true_url, headers=headers)
        if response.status_code == 200:
            # re.S lets '.' span line breaks so the pattern matches each
            # movie's detail-page link on the list page.
            pattern1 = re.compile('<div.*?class="item">.*?<div.*?class="pic">.*?<a.*?href="(.*?)">', re.S)
            url = re.findall(pattern1, response.text)
            # findall returns a list; appending it directly would nest lists,
            # so extend the results in flat (see the side note below).
            urls.extend(url)

    return urls
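# Side note on append vs extend: appending a list nests it, extending splices
# its elements in flat.
#   a = [1]; a.append([2, 3])   # -> [1, [2, 3]]
#   b = [1]; b.extend([2, 3])   # -> [1, 2, 3]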

def parse_url(urls):
    # Because only the first two pages were crawled, 50 detail pages are parsed.
    for i in range(0, 50):
        res = requests.get(urls[i], headers=headers_urls)
        print(res)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')

            # Movie title
            name = soup.find('span', property="v:itemreviewed")
            names.append(name.text)
            # print(names)

            # Director
            director = soup.find('a', rel="v:directedBy")
            directors.append(director.text)
            # print(director.text)

            # Leading actors. Collect all of them for one movie first, then
            # append the list once (appending inside the inner loop would add
            # the same list repeatedly).
            star_save = []
            for star in soup.find_all('a', rel="v:starring"):
                star_save.append(star.text)
            stars.append(star_save)
            # print(stars)

            # Country/region of production. The page markup looks like:
            # <span class="pl">制片国家/地区:</span> 美国 <br>
            # Learning point: match a tag by its text (which must be the Chinese
            # label exactly as it appears on the page), then step to the next
            # sibling node; the [1:] slice drops the leading space.
            country = soup.find('span', text='制片国家/地区:').next_sibling[1:]
            countrys.append(country)
            # print(countrys)

            # Language
            # <span class="pl">语言:</span> 英语 <br>
            language = soup.find('span', text='语言:').next_sibling[1:]
            languages.append(language)
            # print(language)
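# A tiny standalone illustration of the sibling trick used above (assuming the
# markup shape shown in the comments):
#   from bs4 import BeautifulSoup
#   s = BeautifulSoup('<span class="pl">语言:</span> 英语 <br/>', 'lxml')
#   s.find('span', text='语言:').next_sibling   # -> ' 英语 '
# The [1:] slice then drops the leading space.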


if __name__ == '__main__':
    base = base_urls(base_url)
    print(base)
    print(len(base))
    parse_url(base)
    print(countrys)
    print(directors)
    print(languages)
    print(names)

    # Finally, write the data into an Excel table.
    info = {'Filmname': names, 'Directors': directors, 'Country': countrys, 'Languages': languages}
    pdfile = pandas.DataFrame(info)
    pdfile.to_excel('DoubanFilm.xlsx', sheet_name="Douban movies")
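One note on this last step: depending on your pandas version, to_excel needs an Excel writer engine installed (openpyxl or xlsxwriter for .xlsx files). A quick way to confirm the write worked is to read the sheet back, using the same file and sheet names as above:

import pandas

check = pandas.read_excel('DoubanFilm.xlsx', sheet_name='Douban movies')
print(check.head())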

Posted by dirkadirka on Thu, 11 Apr 2019 00:09:33 -0700