Rapid Development of a Distributed Search-Engine Crawler with Python and Scrapy


Writing the spider file to crawl content in a loop

The Request() method hands a specified url to the downloader, which fetches the page. It takes two required parameters:
Parameters:
  url='url'                the address of the page to download
  callback=page handler    the callback function that parses the downloaded response
The request is handed to the engine with yield Request().
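A minimal sketch of yielding a Request inside a spider; the spider name, urls, and the parse_detail handler below are placeholders and only illustrate the two parameters:

import scrapy
from scrapy.http import Request

class DemoSpider(scrapy.Spider):                    # hypothetical spider, for illustration only
    name = 'demo'
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Hand the url to the downloader; parse_detail is called with the downloaded response
        yield Request(url='http://blog.jobbole.com/110287/', callback=self.parse_detail)

    def parse_detail(self, response):
        print(response.url)                         # The callback receives the downloaded page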

parse.urljoin() is a method in the urllib library that joins urls automatically: if the url given as the second argument is a relative path, it is joined onto the first argument (the base url); an absolute url is returned unchanged.
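A minimal sketch of this joining behaviour (the urls are just examples):

from urllib import parse

# A relative path is joined onto the base url
print(parse.urljoin('http://blog.jobbole.com/all-posts/', '/110287/'))
# -> http://blog.jobbole.com/110287/

# An absolute url is returned unchanged
print(parse.urljoin('http://blog.jobbole.com/all-posts/', 'http://blog.jobbole.com/110287/'))
# -> http://blog.jobbole.com/110287/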

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request                             # Request hands a url back to the downloader
from urllib import parse                                    # parse module of the urllib library

class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']                  # Allowed domain
    start_urls = ['http://blog.jobbole.com/all-posts/']     # Start url

    def parse(self, response):
        """
        //Get the url address of the list page and hand it to the downloader
        """
        # Get the article urls on the current page
        lb_url = response.xpath('//a[@class="archive-title"]/@href').extract()                 # Get the list of article urls
        for i in lb_url:
            # print(parse.urljoin(response.url, i))                                            # urljoin() joins a relative url onto response.url
            yield Request(url=parse.urljoin(response.url, i), callback=self.parse_wzhang)      # Hand each article url to the downloader; parse_wzhang handles the response

        # Get the next-page list url, hand it to the downloader, and loop back into parse
        x_lb_url = response.xpath('//a[@class="next page-numbers"]/@href').extract()             # Get the next-page url
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url[0]), callback=self.parse)     # Hand the next-page url to the downloader and call parse again, looping through all pages

    def parse_wzhang(self, response):
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()              # Get the article title
        print(title)


When Request() hands a url to the downloader, it can also pass a custom dictionary to the callback function through the meta parameter.
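A minimal sketch of passing a value through meta and reading it back in the callback (the spider name, urls, and 'lb_img' key are placeholders mirroring the full example below):

import scrapy
from scrapy.http import Request

class MetaDemoSpider(scrapy.Spider):                 # hypothetical spider, for illustration only
    name = 'meta_demo'
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # Attach extra data to the request through the meta dictionary
        yield Request(url='http://blog.jobbole.com/110287/',
                      meta={'lb_img': 'http://blog.jobbole.com/img/thumb.jpg'},
                      callback=self.parse_detail)

    def parse_detail(self, response):
        # Read the value back from response.meta; get() avoids a KeyError if the key is missing
        tp_img = response.meta.get('lb_img', '')
        print(tp_img)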

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request                             # Request hands a url back to the downloader
from urllib import parse                                    # parse module of the urllib library
from adc.items import AdcItem                               # Import the item class that receives the scraped data

class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']                  # Allowed domain
    start_urls = ['http://blog.jobbole.com/all-posts/']     # Start url

    def parse(self, response):
        """
        //Get the url address of the list page and hand it to the downloader
        """
        #Get the current page article url
        lb = response.css('div .post.floated-thumb')  #Get the article list block, css selector
        # print(lb)
        for i in lb:
            lb_url = i.css('.archive-title ::attr(href)').extract_first('')     #Get the article url in the block
            # print(lb_url)

            lb_img = i.css('.post-thumb img ::attr(src)').extract_first('')     #Get article thumbnails in blocks
            # print(lb_img)

            yield Request(url=parse.urljoin(response.url, lb_url), meta={'lb_img':parse.urljoin(response.url, lb_img)}, callback=self.parse_wzhang)      # Hand each article url to the downloader, passing the thumbnail url through meta; parse_wzhang handles the response

        # Get the next-page list url, hand it to the downloader, and loop back into parse
        x_lb_url = response.css('.next.page-numbers ::attr(href)').extract_first('')          # Get the next-page url
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url), callback=self.parse)     # Hand the next-page url to the downloader and call parse again, looping through all pages

    def parse_wzhang(self,response):
        title = response.css('.entry-header h1 ::text').extract()           #Get the title of the article
        # print(title)

        tp_img = response.meta.get('lb_img', '')                            #Receive meta-passed values and use get to prevent errors
        # print(tp_img)

        shjjsh = AdcItem()                          # Instantiate the item class
        shjjsh['title'] = title                     # Fill the scraped data into the item fields defined in items.py
        shjjsh['img'] = tp_img

        yield shjjsh                                # Hand the item to the pipelines.py processing module

Using Scrapy's built-in image downloader

Scrapy provides a built-in image downloader, scrapy.pipelines.images.ImagesPipeline, which downloads images locally after the crawler has scraped their urls.
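As a minimal sketch, the built-in downloader can also be enabled directly in settings.py without a custom subclass; the 'img' field name and the save directory below are assumptions matching the example item used later:

# settings.py - minimal sketch, assuming the item stores its image urls in an 'img' field
# (the images pipeline requires the Pillow library)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,   # Scrapy's built-in image downloader
}
IMAGES_URLS_FIELD = 'img'                          # item field that holds the list of image urls
IMAGES_STORE = 'img'                               # directory where downloaded images are saved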

Step 1: after the crawler scrapes the image url, fill it into the container class in the items.py file.

The crawler file

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request                             # Request hands a url back to the downloader
from urllib import parse                                    # parse module of the urllib library
from adc.items import AdcItem                               # Import the item class that receives the scraped data

class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']                  # Allowed domain
    start_urls = ['http://blog.jobbole.com/all-posts/']     # Start url

    def parse(self, response):
        """
        //Get the url address of the list page and hand it to the downloader
        """
        #Get the current page article url
        lb = response.css('div .post.floated-thumb')  #Get the article list block, css selector
        # print(lb)
        for i in lb:
            lb_url = i.css('.archive-title ::attr(href)').extract_first('')     #Get the article url in the block
            # print(lb_url)

            lb_img = i.css('.post-thumb img ::attr(src)').extract_first('')     #Get article thumbnails in blocks
            # print(lb_img)

            yield Request(url=parse.urljoin(response.url, lb_url), meta={'lb_img':parse.urljoin(response.url, lb_img)}, callback=self.parse_wzhang)      # Hand each article url to the downloader, passing the thumbnail url through meta; parse_wzhang handles the response

        # Get the next-page list url, hand it to the downloader, and loop back into parse
        x_lb_url = response.css('.next.page-numbers ::attr(href)').extract_first('')          # Get the next-page url
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url), callback=self.parse)     # Hand the next-page url to the downloader and call parse again, looping through all pages

    def parse_wzhang(self,response):
        title = response.css('.entry-header h1 ::text').extract()           #Get the title of the article
        # print(title)

        tp_img = response.meta.get('lb_img', '')                            #Receive meta-passed values and use get to prevent errors
        # print(tp_img)

        shjjsh = AdcItem()                          # Instantiate the item class
        shjjsh['title'] = title                     # Fill the scraped data into the item fields defined in items.py
        shjjsh['img'] = [tp_img]                    # The image downloader expects a list of urls

        yield shjjsh                                # Hand the item to the pipelines.py processing module

Step 2: set up the container class in the items.py file to receive the data filled in by the crawler

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# items.py receives the data scraped by the crawler; it acts as a container file.

class AdcItem(scrapy.Item):    # Container class for the data the crawler scrapes
    title = scrapy.Field()     # Receives the article title
    img = scrapy.Field()       # Receives the thumbnail url
    img_tplj = scrapy.Field()  # Receives the local save path of the downloaded image

Step 3: use Scrapy's built-in image downloader in pipelines.py

1. Import the built-in image downloader.

2. Define a custom image downloader class that inherits from Scrapy's built-in ImagesPipeline class.

3. Override the item_completed() method of the ImagesPipeline class to get the saved path of the image after it has been downloaded.

4. In the settings.py file, register the custom image downloader class and set the path where images are saved.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline  # Import the built-in image downloader

class AdcPipeline(object):                      # Data processing class; inherits from object
    def process_item(self, item, spider):       # process_item() receives the item yielded by the crawler
        print('The article title is: ' + item['title'][0])
        print('The article thumbnail url is: ' + item['img'][0])
        print('The article thumbnail is saved at: ' + item['img_tplj'])  # Path filled in by the image downloader after the download

        return item

class imgPipeline(ImagesPipeline):                      # Custom image downloader, inheriting from Scrapy's built-in ImagesPipeline class
    def item_completed(self, results, item, info):      # item_completed() gives access to the saved path of each downloaded image
        for ok, value in results:
            img_lj = value['path']     # The save path of the downloaded image
            # print(ok)
            item['img_tplj'] = img_lj  # Fill the image save path into the img_tplj field defined in items.py
        return item                    # Hand the item back to the container class in items.py

    # Note: after defining the custom image downloader, it must still be registered in settings.py

Step 4: in the settings.py file, register the custom image downloader class and set the image save path.

IMAGES_URLS_FIELD sets which items.py field holds the urls of the images to download
IMAGES_STORE sets the path where downloaded images are saved

import os

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'adc.pipelines.AdcPipeline': 300,             # Register the adc.pipelines.AdcPipeline class; the number is the execution order
   'adc.pipelines.imgPipeline': 1,               # Register the custom image downloader; the smaller the number, the earlier it runs
}

IMAGES_URLS_FIELD = 'img'                        # The items.py field that holds the urls of the images to download
lujin = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(lujin, 'img')        # The path where downloaded images are saved
