Crawling 1000 Pages of Baidu Encyclopedia with Python

Keywords: Python, ascii

### Crawler Architecture: Dynamic Workflow

  • The scheduler asks the URL manager whether it has a URL waiting to be crawled; if so, the scheduler takes one URL from the manager.
  • The scheduler passes that URL to the downloader, which fetches the page content and returns it to the scheduler.
  • The scheduler passes the downloaded content to the parser. After parsing, the parser returns the valuable data plus a list of new URLs found on the page.
  • The scheduler hands the valuable data to the application (the outputer below) for collection, and adds the list of new URLs to the URL manager.
  • If the URL manager still has URLs to crawl, the loop repeats.
  • When the URL manager runs out of URLs (or the page limit is reached), the collected data is written out. The five modules below implement this workflow.
#!/usr/bin/python
# coding=utf-8
# spider_main.py - the scheduler and program entry point (Python 2)

# if the modules are kept in a baike_spider package, import from it instead:
#from baike_spider import url_manager,html_downloader,html_parser,html_outputer
import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    """docstring for SpiderMain"""
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self,root_url):
        count = 1
        self.urls.add_new_url(root_url)
        # while the manager has URLs: download, parse, feed newly found URLs
        # back into the manager, and hand useful content to the collector
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parser(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 1000:
                    break
                count = count + 1
            except Exception as e:
                print 'craw failed:', e
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
#!/usr/bin/python
# coding=utf-8
# url_manager.py - tracks pending and already-crawled URLs

class UrlManager(object):
    """docstring for UrlManager"""
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
            
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # set.pop() removes and returns an arbitrary element
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
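
As a quick sanity check (not part of the original modules), the manager's de-duplication can be exercised on its own; the snippet below is illustrative only:

# illustrative only: exercising UrlManager's de-duplication (Python 2)
import url_manager

m = url_manager.UrlManager()
m.add_new_url('http://baike.baidu.com/view/21087.htm')
m.add_new_url('http://baike.baidu.com/view/21087.htm')  # duplicate, ignored
print m.has_new_url()   # True: exactly one URL is waiting
print m.get_new_url()   # returns the URL and moves it to old_urls
m.add_new_url('http://baike.baidu.com/view/21087.htm')  # already crawled, ignored
print m.has_new_url()   # False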
#!/usr/bin/python
# coding=utf-8
# html_downloader.py - page fetcher built on urllib2

import urllib2

class HtmlDownloader(object):
    """docstring for HtmlDownloader"""
    def __init__(self):
        pass

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)
        #print response.getcode()

        if response.getcode() != 200:
            return None

        return response.read()
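
The downloader above relies on urllib2's defaults, so a slow server can stall the crawl, and the default Python-urllib user agent is sometimes rejected. A hedged variant of download(), assuming a 10-second timeout and a browser-like User-Agent are acceptable choices:

# sketch of a more defensive download(); timeout and header values are assumptions
import urllib2

def download(url):
    if url is None:
        return None
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(request, timeout=10)  # don't hang forever
    except urllib2.URLError:
        return None  # also covers HTTPError, which subclasses URLError
    if response.getcode() != 200:
        return None
    return response.read()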
#!/usr/bin/python
# coding=utf-8
# html_parser.py - extracts new links and lemma data with BeautifulSoup

from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):
    """docstring for HtmlParser"""
    def __init__(self):
        pass

    def parser(self, page_url, html_cont):
        #print page_url
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding = 'utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # lemma links look like /view/<number>.htm, e.g. /view/123.htm
        new_urls = set()
        links = soup.find_all('a', href = re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            #print new_url
            # join the relative href with page_url to get an absolute URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
            #print new_full_url
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}  # dict holding the extracted fields

        # url
        res_data['url'] = page_url
        #<dd class="lemmaWgt-lemmaTitle-title">
        #<h1>Python</h1>
        title_node = soup.find('dd',  class_= "lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        #<div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_= "lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data
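
For intuition about _get_new_urls: the regex matches site-relative hrefs such as /view/123.htm, and urlparse.urljoin resolves them against the page they were found on. A tiny illustrative check:

# illustrative only: how urljoin completes a relative lemma link (Python 2)
import urlparse

page_url = 'http://baike.baidu.com/view/21087.htm'
print urlparse.urljoin(page_url, '/view/123.htm')
# prints: http://baike.baidu.com/view/123.htm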
#!/usr/bin/python
# coding=utf-8
# html_outputer.py - collects parsed records and writes output.html

class HtmlOutputer(object):
    """docstring for HtmlOutputer"""
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write("<html>")
        # declare utf-8 so browsers render the Chinese text correctly
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")

        # Python 2 strings default to ascii, hence the explicit utf-8 encodes below
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")

        fout.close()
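
A variant that avoids the manual .encode('utf-8') calls is to open the file with an explicit encoding via the standard codecs module. This sketch assumes the same self.datas structure and is one possible replacement for output_html(), not the author's code:

# hypothetical rewrite of output_html() using codecs (Python 2)
import codecs

def output_html(datas):
    fout = codecs.open('output.html', 'w', encoding='utf-8')
    fout.write(u"<html><head><meta charset='utf-8'></head><body><table>")
    for data in datas:
        # unicode values are written directly; codecs encodes them on the way out
        fout.write(u"<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
                   % (data['url'], data['title'], data['summary']))
    fout.write(u"</table></body></html>")
    fout.close()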
