Why use distributed crawlers?
- When we first learn to write crawlers, everything usually lives in a single Python file, with little thought given to performance or efficiency. To move beyond that, a qualified spider developer needs to learn about distributed crawlers.
- What is a distributed crawler? In short, it uses multiple servers to fetch data, with the servers cooperating and dividing the tasks among themselves.
Distributed crawler design
The most commonly used design is the master-slave distributed crawler, and that is what this article builds: a Redis server acts as the task queue, the master parses the task list and writes the tasks into Redis, and each slave pulls pending tasks from Redis and downloads the corresponding files.
(Architecture diagram: master-slave crawler with Redis as the shared task queue.)
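Before looking at the full scripts, here is a minimal sketch of the division of labour. It assumes a Redis instance on localhost and uses hypothetical task IDs; the key names follow the `Task:*` convention used in the code later in this article.

```python
# Minimal sketch (not the full implementation): the master registers tasks in Redis,
# and each slave claims a pending task, does the work, then marks it finished.
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)  # assumed local Redis

# Master side: register the tasks that need to be done.
for task_id in ['00001', '00002', '00003']:               # hypothetical IDs
    r.set('Task:id:{}'.format(task_id), str({'id': task_id}))

# Slave side: pick up any task that is neither in progress nor finished.
for key in r.keys('Task:id:*'):
    task_id = key.decode().split(':')[2]
    if r.sismember('Task:finish', task_id) or r.sismember('Task:begin', task_id):
        continue
    r.sadd('Task:begin', task_id)
    # ... download the file for this task here ...
    r.srem('Task:begin', task_id)
    r.sadd('Task:finish', task_id)
```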
Preparation
- Install Python 3 and a Redis server
- Install the requests, pyquery, and redis libraries:

```
pip install requests
pip install pyquery
pip install redis
```
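Before writing any crawler code, it is worth checking that the three libraries import correctly and that the Redis server is reachable. The snippet below is only a sanity check and assumes Redis is running locally on the default port 6379; adjust the host, port, and test URL to your setup.

```python
# check_env.py -- optional sanity check, not part of the crawler itself.
import redis
import requests
from pyquery import PyQuery as pq

r = redis.StrictRedis(host='localhost', port=6379, db=0)   # assumed Redis location
print('redis ok:', r.ping())                               # True if the server answers

res = requests.get('https://example.com')                  # any reachable page will do
print('requests ok:', res.status_code)
print('pyquery ok:', pq(res.content)('title').text())
```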
Code
Master script (main.py)
```python
import os

import requests
from pyquery import PyQuery as pq

import config
from cache import RedisCache
from model import Task


def parse_link(div):
    '''Get the link from a single list item.'''
    e = pq(div)
    href = e.find('a').attr('href')
    return href


def get_from_url(url):
    '''Get the task links from the list page.'''
    page = get_page(url)
    e = pq(page)
    items = e('.epiItem.video')
    links = [parse_link(i) for i in items]
    print(len(links))
    links.reverse()
    return links


def get_page(url):
    '''Download a page and return its raw content.'''
    proxies = config.proxies
    try:
        res = requests.get(url, proxies=proxies)
        return res.content
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        return None


def get_all_file(path, fileList=None):
    '''Recursively collect the absolute paths of all files under a directory.'''
    if fileList is None:
        fileList = []
    for i in os.listdir(path):
        sub_dir = os.path.join(path, i)
        if os.path.isdir(sub_dir):
            # Still a folder: recurse into it
            get_all_file(sub_dir, fileList)
        else:
            # A regular file: record its absolute path
            fileList.append(os.path.abspath(sub_dir))
    return fileList


def init_finish_task(path):
    '''Mark tasks whose files already exist locally as finished.'''
    redis_cache = RedisCache()
    fileList = get_all_file(path)
    for file in fileList:
        file_name = os.path.basename(file)
        task_id = file_name[:5]
        redis_cache.sadd('Task:finish', task_id)
    print('init_finish_task...end')


def init_task_url():
    '''Parse the list page and register every task in Redis.'''
    redis_cache = RedisCache()
    url = config.list_url
    link_list = get_from_url(url)
    for link in link_list:
        task_url = config.task_url_head + link
        task_id = task_url[-5:]
        t = task_from_url(task_url)
        print('add task_id {}'.format(task_id))
        redis_cache.set('Task:id:{}'.format(task_id), t.__dict__)
    print('init_task_url...end')


def task_from_url(task_url):
    '''Build a Task object from a task page.'''
    page = get_page(task_url)
    e = pq(page)
    task_id = task_url[-5:]
    title = e('.controlBar').find('.epi-title').text().replace('/', '-').replace(':', ': ')
    file_url = e('.audioplayer').find('audio').attr('src')
    ext = file_url[-4:]
    file_name = task_id + '.' + title + ext
    t = Task()
    t.id = task_id
    t.title = title
    t.url = task_url
    t.file_name = file_name
    t.file_url = file_url
    return t


def main():
    init_task_url()
    init_finish_task(config.down_folder)


if __name__ == '__main__':
    main()
```
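main.py imports three small helper modules (`config`, `cache`, `model`) that are not shown in the original post. The sketches below are one plausible shape for them, inferred purely from how they are used above; the URLs, proxy settings, and download folder in `config.py` are placeholders you must replace with your own values.

```python
# config.py -- placeholder values only; replace them with your own site and paths.
proxies = None                              # e.g. {'http': 'http://127.0.0.1:1080'}
list_url = 'https://example.com/episodes'   # hypothetical list page to crawl
task_url_head = 'https://example.com'       # prefix joined with each parsed link
down_folder = '/path/to/downloads'          # where the slaves store the files
```

```python
# cache.py -- a thin wrapper around redis-py exposing only the methods used above.
import redis


class RedisCache:
    def __init__(self, host='localhost', port=6379, db=0):
        # Connection parameters are assumptions; point them at your shared Redis server.
        self.client = redis.StrictRedis(host=host, port=port, db=db)

    def set(self, key, value):
        # str(value) lets main.py store a Task.__dict__; salver.py later turns the
        # single-quoted repr back into JSON before calling json.loads.
        return self.client.set(key, str(value))

    def get(self, key):
        return self.client.get(key)

    def keys(self, pattern):
        return self.client.keys(pattern)

    def sadd(self, key, value):
        return self.client.sadd(key, value)

    def srem(self, key, value):
        return self.client.srem(key, value)

    def sismember(self, key, value):
        return self.client.sismember(key, value)
```

```python
# model.py -- a plain container for one download task.
class Task:
    def __init__(self):
        self.id = ''
        self.title = ''
        self.url = ''
        self.file_name = ''
        self.file_url = ''
```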
Slave script (salver.py)
```python
import os
import json

import requests

import config
from cache import RedisCache


def get_page(url):
    '''Download a page and return its raw content.'''
    proxies = config.proxies
    try:
        res = requests.get(url, proxies=proxies)
        return res.content
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        return None


def begin_task(task_id, file_name, file_url):
    print('begin task {}'.format(task_id))
    redis_cache = RedisCache()
    # Add the task to the "downloading" set
    redis_cache.sadd('Task:begin', task_id)
    print('download...{}'.format(file_name))
    folder = config.down_folder
    path = os.path.join(folder, file_name)
    download(file_url, path)
    print('end task {}'.format(task_id))


def download(link, path):
    redis_cache = RedisCache()
    proxies = config.proxies
    if os.path.exists(path):
        print('file exists')
        return
    try:
        r = requests.get(link, proxies=proxies, stream=True)
        total_length = int(r.headers['Content-Length'])
        with open(path, "wb") as code:
            code.write(r.content)
        # Check that the downloaded file is complete
        length = os.path.getsize(path)
        print('length={}'.format(length))
        print('total_length={}'.format(total_length))
        if total_length != length:
            # Delete the incomplete file and download it again
            os.remove(path)
            download(link, path)
        else:
            print('download success')
            # Move the task from the "downloading" set to the "finished" set
            file_name = os.path.basename(path)
            content_id = file_name[:5]
            redis_cache.srem('Task:begin', content_id)
            redis_cache.sadd('Task:finish', content_id)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)


def main():
    redis_cache = RedisCache()
    # Collect every task key and skip the ones already begun or finished
    keys = redis_cache.keys('Task:id:[0-9]*')
    new_key = [key.decode() for key in keys]
    # Sort tasks by id
    new_key = sorted(new_key, key=lambda i: int(i.split(':')[2]))
    for key in new_key:
        task_id = key.split(':')[2]
        is_finish = redis_cache.sismember('Task:finish', task_id)
        is_begin = redis_cache.sismember('Task:begin', task_id)
        if is_finish:
            print('Task {} is finished'.format(task_id))
        elif is_begin:
            print('Task {} has begun'.format(task_id))
        else:
            # The task dict was stored as str(Task.__dict__); turn the single quotes
            # into double quotes so json.loads can parse it
            task = json.loads(redis_cache.get(key).decode('utf-8').replace("\'", "\""))
            begin_task(task_id, task['file_name'], task['file_url'])


if __name__ == '__main__':
    main()
```
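To run the crawler: start the Redis server, run `python main.py` once on the master to populate the task queue, then run `python salver.py` on each slave machine, all pointing at the same Redis instance. The small helper below is not part of the original scripts; it only reads the sets maintained above to show overall progress, and the connection parameters are assumptions.

```python
# progress.py -- optional helper to inspect the shared queue from any machine.
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)   # assumed Redis location

total = len(r.keys('Task:id:[0-9]*'))    # tasks registered by main.py
begun = r.scard('Task:begin')            # tasks a slave is currently downloading
finished = r.scard('Task:finish')        # tasks already downloaded

print('total: {}  downloading: {}  finished: {}'.format(total, begun, finished))
```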