Python daily practice: a simple crawler~

Keywords: network, PyCharm, encoding

Goal: crawl the novels in the Youth | Campus section of the article-reading site and save them locally

Attached website: http://xs.duwenzhang.com/list1/

IDE used: PyCharm

Python 3

The code:

import os
import re
import urllib.request
from bs4 import BeautifulSoup
# Goal: save all the youth/campus novels on the site to local disk
# (the chapters of each novel are stored under a directory named after the novel's title)

def get_url_list(url):
    rq = urllib.request.urlopen(url)
    total_url = set()  # listing-page URLs, including the root page itself
    if rq.getcode() == 200:
        total_url.add(url)
        soup = BeautifulSoup(rq, 'html.parser', from_encoding='gbk')
        url_list = soup.find_all('a', href=re.compile(r'/list1/[0-9]+\.html'))
        for link in url_list:
            total_url.add(link['href'])
        return total_url
    else:
        return 'Crawl failure'


def get_article_urlList(url_list):
    page_url_list = set()  # entry URLs of the novels found on the listing pages
    for article_url in url_list:
        rq = urllib.request.urlopen(article_url)
        if rq.getcode() == 200:
            soup = BeautifulSoup(rq, 'html.parser', from_encoding='gbk')
            page_url = soup.find_all('a', href=re.compile(r'/1/[0-9]+/'))
            for link in page_url:
                page_url_list.add('http://xs.duwenzhang.com' + link['href'])
        else:
            print('Crawl failure: %s' % article_url)
    return page_url_list  # moved outside the loop so every listing page is processed


def get_page_chapter_url(url_list):
    chaper_url = set()  # chapter URLs collected from every novel's index page
    for url in url_list:
        rq = urllib.request.urlopen(url)
        if rq.getcode() == 200:
            soup = BeautifulSoup(rq, 'html.parser', from_encoding='gbk')
            chaper_list = soup.find_all('a', href=re.compile(r'/[0-9]+\.html'))
            for chaper in chaper_list:
                chaper_url.add(chaper['href'])
        else:
            print('Crawl failure: %s' % url)
    return chaper_url


def get_chaper_page(url_list):
    for url in url_list:
        rq = urllib.request.urlopen(url)
        soup = BeautifulSoup(rq, 'html.parser', from_encoding='gbk')
        title = soup.find('a', href=re.compile('/1/[0-9]+/'))  # link back to the novel; its text is the novel title
        chaper_titile = soup.find(name='dt')  # the chapter title sits in the <dt> tag
        chaper_content = soup.find(name='dd')  # the chapter text sits in the <dd> tag
        if not title:
            print('page not found: %s' % url)
            continue
        else:
            title = str(title.get_text())
            chaper_titile = str(chaper_titile.get_text())
            chaper_content = str(chaper_content.get_text())
            dir_path = ('F:/Crawler file/Youth Campus Novels/' + str(title)).strip()
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)  # create the novel's directory (and any missing parents)
            strinfo = re.compile(r'\s+')  # strip whitespace from the chapter title before using it as a file name
            chaper_title = strinfo.sub('', chaper_titile)
            file_path = dir_path + '/' + chaper_title + '.txt'
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(chaper_content)
    return 'Crawl finished'


if __name__ == '__main__':
    root_url = 'http://xs.duwenzhang.com/list1/'  # root URL: the front page of the campus fiction listing
    Url_List = get_url_list(root_url)  # step 1: get the home page and the other listing-page URLs
    Page_List = get_article_urlList(Url_List)  # step 2: get the entry URLs of the novels on those pages
    Chaper_url = get_page_chapter_url(Page_List)  # step 3: get the URL of every chapter
    print(get_chaper_page(Chaper_url))  # step 4: download each chapter and save it to disk

Attached are screenshots of the results.

When crawling, some of the novels come back empty (...... maybe the content is blocked).
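
If writing blank files for those is unwanted, the write can be guarded on the chapter body first; a minimal sketch of an extra check inside the for loop of get_chaper_page, reusing the script's names (the guard itself is my addition, not part of the original code):

chaper_content = soup.find(name='dd')
# skip pages whose <dd> block is missing or holds only whitespace,
# instead of writing an empty .txt file
if chaper_content is None or not chaper_content.get_text().strip():
    print('empty chapter, skipped: %s' % url)
    continue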

The first folder name came out garbled... I don't know why yet... still to be resolved...

The files inside that garbled directory are garbled as well.

My guess is that this novel's encoding is inconsistent with that of the other novels....

All the other directories and text files are fine.
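
One way to test that guess would be to stop forcing gbk and let BeautifulSoup sniff each page's encoding instead; a minimal sketch (whether it actually fixes this particular novel is untested):

rq = urllib.request.urlopen(url)
# pass the raw bytes and let bs4/UnicodeDammit detect the charset instead
# of hard-coding from_encoding='gbk' for every page
soup = BeautifulSoup(rq.read(), 'html.parser')
print(soup.original_encoding)  # shows which encoding was actually detected

Another low-risk thing to try is from_encoding='gb18030', since gb18030 is a superset of gbk and covers characters that gbk cannot decode.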

Ahem... my skills still need improvement.

 

Posted by squalls_dreams on Sat, 13 Apr 2019 22:06:33 -0700