Building a Distributed Multi-Process Crawler by Hand with multiprocessing

Keywords: Programming encoding JSON NodeManager network

A multiprocessing-Based Multi-Process Crawler for Zhihu Users

Crawl content screenshots

  • Start the control node

  • Start the crawler node

  • Control node

ControlNode Control Node Part

NodeManager - Control Scheduler

#coding:utf-8
import time

from multiprocessing.managers import BaseManager
from multiprocessing import Process, Queue

from DataOutput import DataOutput
from MemberManager import MemberManager


class NodeManager(object):
    """Control-node scheduler.

    Exposes the task and result queues through a ``BaseManager`` and provides
    the three worker loops (member management, result extraction, storage)
    that the control node runs in separate processes.
    """

    # The member-management loop sends the 'end' notification once
    # old_member_size() exceeds this value.
    MAX_OLD_MEMBERS = 2000

    # Create a Distributed Manager
    def start_Manager(self, member_q, result_q):
        """Register both queues with ``BaseManager`` and return the manager.

        The queues are registered under the names ``get_task_queue`` and
        ``get_result_queue``; the ``callable`` argument ties each name to the
        corresponding Queue object. The returned manager is not started here;
        the caller decides how to serve it.

        :param member_q: queue of members waiting to be crawled.
        :param result_q: queue on which crawl results are returned.
        :return: a ``BaseManager`` configured for port 8001.
        """
        BaseManager.register('get_task_queue', callable=lambda: member_q)
        BaseManager.register('get_result_queue', callable=lambda: result_q)

        manager = BaseManager(address=('', 8001), authkey='zhihuuser'.encode('utf-8'))
        return manager

    # Process 1: member management
    def member_manager_proc(self, member_q, con_q, root_member):
        """Feed pending members to ``member_q`` and absorb new ones from ``con_q``.

        Once ``old_member_size()`` exceeds ``MAX_OLD_MEMBERS`` the loop puts
        ``'end'`` on ``member_q``, saves both member sets to
        ``new_members.txt`` / ``old_members.txt`` and returns.

        :param member_q: queue read by the crawler nodes.
        :param con_q: queue of newly discovered members (filled by
            ``result_solve_proc``).
        :param root_member: first member to crawl.
        """
        member_manager = MemberManager()
        member_manager.add_new_member(root_member)
        while True:
            while member_manager.has_new_member():
                # Hand the next pending member to the working nodes.
                new_member = member_manager.get_new_member()
                member_q.put(new_member)
                print('old_member=', member_manager.old_member_size())

                if member_manager.old_member_size() > self.MAX_OLD_MEMBERS:
                    member_q.put('end')
                    print('Control Node Initiates End Notification!')
                    # Persist both sets so a later run can resume from here.
                    member_manager.save_progress('new_members.txt', member_manager.new_members)
                    member_manager.save_progress('old_members.txt', member_manager.old_members)
                    return
            # Merge the members reported by result_solve_proc.
            try:
                members = con_q.get()
                member_manager.add_new_members(members)
            except Exception as e:
                # Exception (not BaseException) so that KeyboardInterrupt and
                # SystemExit can still stop the process.
                print('member_manager_proc error: %r' % (e,))
                time.sleep(0.1)

    # Process 2: Data extraction
    def result_solve_proc(self, result_q, con_q, store_q):
        """Split each crawl result into new members and parsed data.

        Every item taken from ``result_q`` is indexed with ``'new_members'``
        and ``'data'``. The former is forwarded to ``con_q`` and the latter to
        ``store_q``. When ``'new_members'`` equals ``'end'`` the loop forwards
        ``'end'`` to ``store_q`` and returns.
        """
        while True:
            try:
                if not result_q.empty():
                    content = result_q.get(True)
                    if content['new_members'] == 'end':
                        print('The result analysis process receives notification and then ends!')
                        store_q.put('end')
                        return
                    con_q.put(content['new_members'])
                    store_q.put(content['data'])
                else:
                    time.sleep(0.1)
            except Exception as e:
                # Keep the loop alive on a bad item, but do not trap
                # KeyboardInterrupt / SystemExit.
                print('result_solve_proc error: %r' % (e,))
                time.sleep(0.1)

    # Process 3: Data storage
    def store_proc(self, store_q):
        """Store every item from ``store_q`` until ``'end'`` is received.

        Items are passed to ``DataOutput.store_data``; on ``'end'`` the output
        file is finalised with ``ouput_end`` and the loop returns.
        """
        output = DataOutput()
        while True:
            if not store_q.empty():
                data = store_q.get()
                if data == 'end':
                    print('Storage process receives notification and ends!')
                    output.ouput_end(output.filepath)
                    return
                output.store_data(data)
            else:
                time.sleep(0.1)


if __name__=='__main__':
    # Control-node entry point: wire up the queues, launch the three worker
    # processes and then serve the shared queues to the crawler nodes.
    scheduler = NodeManager()

    task_queue = Queue()       # members waiting to be crawled
    results_queue = Queue()    # raw results coming back from crawler nodes
    storage_queue = Queue()    # parsed records waiting to be written
    feedback_queue = Queue()   # newly discovered members

    # Build the distributed manager that exposes the task/result queues.
    server_manager = scheduler.start_Manager(task_queue, results_queue)

    # Member management, data extraction and data storage, in start order.
    workers = [
        Process(target=scheduler.member_manager_proc,
                args=(task_queue, feedback_queue, 'excited-vczh')),
        Process(target=scheduler.result_solve_proc,
                args=(results_queue, feedback_queue, storage_queue)),
        Process(target=scheduler.store_proc,
                args=(storage_queue,)),
    ]
    for worker in workers:
        worker.start()

    # Blocks here, serving the registered queues.
    server_manager.get_server().serve_forever()


MemberManager - Zhihu User Manager

#coding:utf-8
import pickle
import hashlib

class MemberManager(object):
    """Tracks members that still need crawling and members already handed out.

    ``new_members`` holds the raw member strings that are pending;
    ``old_members`` holds a shortened md5 fingerprint of every member that has
    been returned by :meth:`get_new_member`.
    """

    def __init__(self):
        # Resume from the progress files when they exist, else start empty.
        self.new_members = self.load_progress('new_members.txt')
        self.old_members = self.load_progress('old_members.txt')

    @staticmethod
    def _fingerprint(member):
        """Return the middle 16 hex characters of the md5 of ``member``."""
        return hashlib.md5(member.encode('utf-8')).hexdigest()[8:-8]

    # Loading progress from local files
    def load_progress(self, path):
        """Return the object pickled at ``path``, or an empty set on failure.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load
        progress files that this program wrote itself.
        """
        print('[+] Loading progress from files: %s' % path)
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except Exception:
            # Missing or unreadable file: stay best-effort and start fresh.
            # Exception (instead of a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print('[!] No progress document, Establish: %s' % path)
        return set()

    # Preservation progress
    def save_progress(self, path, data):
        """Pickle ``data`` to ``path``, overwriting any existing file."""
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    # Determine whether there is a member that has not been crawled
    def has_new_member(self):
        """Return True when at least one member is pending."""
        return self.new_member_size() != 0

    # Get a member that is not crawled
    def get_new_member(self):
        """Pop one pending member, record its fingerprint and return it.

        :raises KeyError: if no member is pending (``set.pop`` on empty set).
        """
        new_member = self.new_members.pop()
        self.old_members.add(self._fingerprint(new_member))
        return new_member

    def add_new_member(self, member):
        """Queue ``member`` unless it is None, already pending or already seen."""
        if member is None:
            return
        if member in self.new_members:
            return
        if self._fingerprint(member) not in self.old_members:
            self.new_members.add(member)

    def add_new_members(self, members):
        """Queue every entry of ``members``; None or empty input is ignored."""
        if not members:
            return
        for member in members:
            self.add_new_member(member)

    def new_member_size(self):
        """Return the number of pending members."""
        return len(self.new_members)

    def old_member_size(self):
        """Return the number of recorded fingerprints."""
        return len(self.old_members)

DataOutput - Data Storage

#coding:utf-8
import codecs
import time

class DataOutput(object):
    """Buffers parsed member records and writes them as rows of an HTML table."""

    # Record keys written per row, in column order.
    COLUMNS = (
        'name', 'member', 'id', 'gender', 'type', 'headline', 'url',
        'answer_count', 'articles_count', 'follower_count', 'badge',
        'employments',
    )

    def __init__(self):
        # One output file per run, named after the local start time.
        self.filepath = 'zhihu_%s.html' % (time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))
        self.output_head(self.filepath)
        self.datas = []

    def store_data(self, data):
        """Buffer ``data``; flush to ``self.filepath`` once more than 10 are held.

        ``None`` is ignored.
        """
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_html(self.filepath)

    # Write HTML headers in
    def output_head(self, path):
        """Create/truncate ``path`` and write the opening HTML and table tags."""
        with codecs.open(path, 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write(r'''<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />''')
            fout.write("<body>")
            fout.write("<table>")

    def _render_row(self, data):
        """Return one ``<tr>`` for ``data`` with every cell HTML-escaped.

        :raises KeyError: if ``data`` lacks one of ``COLUMNS``.
        """
        import html  # local import: the module header does not import html

        # The values come from crawled pages, so <, > and & must be escaped
        # or they would break (or inject into) the generated table.
        cells = ''.join(
            "<td>%s</td>" % html.escape(str(data[column]), quote=False)
            for column in self.COLUMNS
        )
        return "<tr>%s</tr>" % cells

    # Write data into HTML files
    def output_html(self, path):
        """Append all buffered records to ``path`` and clear the buffer."""
        # Render first so that a malformed record cannot leave a half-written
        # row in the file.
        rows = ''.join(self._render_row(data) for data in self.datas)
        with codecs.open(path, 'a', encoding='utf-8') as fout:
            fout.write(rows)
        self.datas = []

    def ouput_end(self, path):
        """Flush any buffered records, then append the closing tags to ``path``.

        The misspelled name is kept because callers use it.
        """
        # Without this flush the last (up to 10) buffered records were lost.
        if self.datas:
            self.output_html(path)
        with codecs.open(path, 'a', encoding='utf-8') as fout:
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

SpiderNode crawler node section

SpiderWork - Crawler Scheduler

#coding:utf-8
from multiprocessing.managers import BaseManager

from Downloader import HtmlDownloader
from Parser import HtmlParser


class SpiderWork(object):
    """Crawler node: pulls members from the control node and reports results."""

    # Initialization of Joint Work of Work Nodes in Distributed Processes
    def __init__(self, server_addr='127.0.0.1', port=8001, authkey=b'zhihuuser'):
        """Connect to the control node and fetch its task/result queues.

        :param server_addr: host of the control node's manager.
        :param port: port of the control node's manager.
        :param authkey: authentication key (bytes) expected by the manager.
        """
        # Only the names are registered here; the objects live on the server.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')

        print(('Connect to server %s...' % server_addr))
        self.m = BaseManager(address=(server_addr, port), authkey=authkey)
        self.m.connect()

        # Get Queue objects:
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()

        # Initialize Web Downloader and Parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        """Process members from the task queue until ``'end'`` is received.

        Each member is downloaded and parsed, and the outcome is put on the
        result queue as ``{"new_members": ..., "data": ...}``. On ``'end'`` an
        end marker is put on the result queue and the method returns. It also
        returns when the queue proxies raise ``EOFError``. Any other exception
        is printed and the loop continues.
        """
        import time  # local import: the module header does not import time

        while True:
            try:
                if self.task.empty():
                    # Back off instead of spinning at full speed against the
                    # manager while there is nothing to do.
                    time.sleep(0.1)
                    continue

                member = self.task.get()
                if member == 'end':
                    print('The control node notifies the crawler node to stop working...')
                    # The other nodes are then notified to stop working.
                    self.result.put({'new_members': 'end', 'data': 'end'})
                    return

                # Print the text itself; encoding it first would show b'...'.
                print('The crawler node is parsing:%s' % member)
                f_html, m_html = self.downloader.download(member)
                new_members, data = self.parser.parse(f_html, m_html)
                self.result.put({"new_members": new_members, "data": data})
            except EOFError:
                print("Failure to connect working nodes")
                return
            except Exception as e:
                print(e)
                print('Crawl fail ')




if __name__=="__main__":
    # Crawler-node entry point: connect to the control node, then crawl
    # until crawl() returns.
    SpiderWork().crawl()

Downloader - HTML Downloader

#coding:utf-8
import requests
from requests.exceptions import ConnectionError

class HtmlDownloader(object):
    """Fetches the followee list and the profile of a member from the Zhihu API."""

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}

    folowees_url = 'https://www.zhihu.com/api/v4/members/{member}/followees'
    member_url = 'https://www.zhihu.com/api/v4/members/{member}'

    follwees_query = {
        'include': 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics',
        'offset': None,
        'limit': 20
    }
    member_query = {
        'include': 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    }

    # Extra attempts made after a 403 response before giving up.
    max_retries = 3
    # Seconds passed as ``timeout`` to every request.
    timeout = 10

    def download(self, member):
        """Return ``(followees_text, member_text)`` for ``member``.

        Returns ``None`` when ``member`` is None. Either element of the tuple
        is ``None`` when the corresponding request did not end in HTTP 200.
        """
        if member is None:
            return None
        followees_html = self.download_followees(member)
        member_html = self.download_member(member)
        return followees_html, member_html

    def download_followees(self, member, offset=0):
        """Return the followees response body of ``member``, or ``None``.

        :param offset: value sent as the ``offset`` query parameter.
        """
        # Copy the template so the shared class-level dict is never mutated.
        query = dict(self.follwees_query, offset=offset)
        return self._fetch(self.folowees_url.format(member=member), query)

    def download_member(self, member):
        """Return the profile response body of ``member``, or ``None``."""
        return self._fetch(self.member_url.format(member=member), self.member_query)

    def _fetch(self, url, query):
        """GET ``url`` with ``query`` as URL parameters.

        A 403 response is retried up to ``max_retries`` more times (a loop
        rather than unbounded recursion). Returns the body decoded as UTF-8 on
        200 and ``None`` for any other outcome. Exceptions raised by the HTTP
        call propagate to the caller.
        """
        for _ in range(self.max_retries + 1):
            r = self._http_get(url, params=query, headers=self.headers, timeout=self.timeout)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.text
            if r.status_code != 403:
                return None
        return None

    def _http_get(self, url, **kwargs):
        """Perform the HTTP GET; kept separate so it can be overridden."""
        return requests.get(url, **kwargs)



Parser - HTML parser

#coding:utf-8
import urllib.parse
import json
import requests

class HtmlParser(object):
    """Parses the JSON bodies returned by the downloader."""

    # (output key, key in the member JSON), in output order.
    MEMBER_FIELDS = (
        ('member', 'url_token'),
        ('id', 'id'),
        ('name', 'name'),
        ('headline', 'headline'),
        ('url', 'url'),
        ('gender', 'gender'),
        ('type', 'type'),
        ('badge', 'badge'),
        ('answer_count', 'answer_count'),
        ('articles_count', 'articles_count'),
        ('follower_count', 'follower_count'),
        ('employments', 'employments'),
    )

    # Used to parse web content, extract url and data
    def parse(self, followees_html, member_html):
        """Return ``(new_members, data)`` parsed from the two response bodies.

        Returns ``None`` when either argument is None.

        :raises ValueError: if a body is not valid JSON (from ``json.loads``).
        """
        if followees_html is None or member_html is None:
            return None
        new_members = self.parse_followees(followees_html)
        new_data = self.parse_member(member_html)
        return new_members, new_data

    # Extracting new url sets
    def parse_followees(self, html):
        """Return the list of ``url_token`` values under the ``data`` key.

        Entries without a ``url_token`` are skipped, and a missing or null
        ``data`` key yields an empty list. Returns ``None`` when ``html`` is
        None.
        """
        if html is None:
            return None
        result = json.loads(html)
        new_members = []
        # `or ()` also covers a payload whose "data" value is null.
        for entry in result.get('data') or ():
            token = entry.get('url_token')
            if not token:
                continue
            print(token)
            new_members.append(token)
        return new_members

    # Extracting valid data
    def parse_member(self, html):
        """Return a dict with the ``MEMBER_FIELDS`` keys read from the JSON body.

        Keys absent from the body map to ``None``. Returns ``None`` when
        ``html`` is None.
        """
        if html is None:
            return None
        result = json.loads(html)
        return {out_key: result.get(src_key) for out_key, src_key in self.MEMBER_FIELDS}

Posted by mdmann on Wed, 30 Jan 2019 02:48:16 -0800