Crawling all the girl pictures with the Scrapy framework

Keywords: Python, Windows

Crawl all the pictures and create a folder for each page of pictures. The difficulty: many of the images are .gif files, so the download rules have to be rewritten.

Create the Scrapy project

scrapy startproject qiumeimei
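
For reference, startproject generates the standard Scrapy layout; items.py, pipelines.py, and settings.py edited below all live in the inner qiumeimei/ package:

qiumeimei/
    scrapy.cfg
    qiumeimei/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py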

Create the crawler application

cd qiumeimei

scrapy genspider -t crawl qmm www.xxx.com

Define the download fields in the items.py file

import scrapy


class QiumeimeiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = scrapy.Field()
    image_url = scrapy.Field()
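
A scrapy.Item behaves like a dict restricted to its declared fields; a quick illustrative sketch (the values and URL are hypothetical):

from qiumeimei.items import QiumeimeiItem

item = QiumeimeiItem()
item['page'] = '1'
item['image_url'] = 'http://www.qiumeimei.com/image/example.jpg'  # hypothetical URL
print(dict(item))      # {'page': '1', 'image_url': 'http://...example.jpg'}
# item['name'] = 'x'   # would raise KeyError: only declared fields are accepted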

Write the main crawler program in the qmm.py file

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qiumeimei.items import QiumeimeiItem

class QmmSpider(CrawlSpider):
    name = 'qmm'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.qiumeimei.com/image']

    rules = (
        Rule(LinkExtractor(allow=r'http://www.qiumeimei.com/image/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        page = response.url.split('/')[-1]
        if not page.isdigit():
            page = '1'
        image_urls = response.xpath('//div[@class="main"]/p/img/@data-lazy-src').extract()
        for image_url in image_urls:
            item = QiumeimeiItem()
            item['image_url'] = image_url
            item['page'] = page
            yield item
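
Why the isdigit() fallback: the page number is taken from the last URL segment, and only the paginated listing URLs end in a digit. Plain string handling, with illustrative URLs:

>>> 'http://www.qiumeimei.com/image/page/2'.split('/')[-1]
'2'
>>> 'http://www.qiumeimei.com/image'.split('/')[-1]  # the first page has no /page/N
'image'  # not a digit, so the spider falls back to page = '1'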

Define the download rules in the pipelines.py file

import scrapy
import os
from scrapy.utils.misc import md5sum
# Import the image-processing pipeline that Scrapy ships with
from scrapy.pipelines.images import ImagesPipeline
# Import the image storage root from settings
from qiumeimei.settings import IMAGES_STORE as images_store
# The pipeline must inherit from ImagesPipeline
class QiumeimeiPipeline(ImagesPipeline):
    # Return the bare file name to store the download under
    # (this replaces the default 'full/<sha1>.jpg' layout)
    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        return file_name
    # Override the parent method that issues a download request for each item
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['image_url'])
    # Called once all the image requests for an item have finished
    def item_completed(self, results, item, info):
        # print(results)
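        # `results` is a list of (success, file_info) 2-tuples, one per request
        # yielded by get_media_requests. Per the Scrapy docs, file_info holds
        # 'url', 'path' (whatever file_path() returned) and 'checksum', e.g.
        # [(True, {'url': 'http://...', 'path': 'img.jpg', 'checksum': '...'})]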
        page = item['page']
        print('Downloading page ' + page + ' pictures')
        image_url = item['image_url']
        image_name = image_url.split('/')[-1]
        old_name_list = [x['path'] for t, x in results if t]
        # The real storage path of the original picture
        old_name = images_store + old_name_list[0]
        image_path = images_store + page + "/"
        # Check whether the directory for this page's pictures exists
        if not os.path.exists(image_path):
            # Create the corresponding directory based on the current page number
            os.mkdir(image_path)
        # New name
        new_name = image_path + image_name
        # rename
        os.rename(old_name, new_name)
        return item
    # Override the download handler so .gif files are stored unmodified
    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image):
                self.persist_gif(path, response.body, info)
            else:
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
        return checksum

    def check_gif(self, image):
        # After Scrapy's convert_image() has run, non-RGB images (including
        # gifs) have been re-encoded, and a converted PIL copy has format None
        return image.format is None

    def persist_gif(self, key, data, info):
        # Write the raw response bytes so the animation is kept intact
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        with open(absolute_path, 'wb') as f:  # 'b' to write binary data
            f.write(data)
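
Why check_gif works (a minimal sketch, assuming Pillow is installed and an example.gif exists locally): Scrapy's default get_images passes every image through convert_image, which converts any image that is not already in RGB mode; a converted PIL copy loses its format attribute, so format is None is a usable .gif marker here, and persist_gif then writes the untouched response body instead of the re-encoded JPEG buffer.

from PIL import Image

gif = Image.open('example.gif')    # hypothetical local file
print(gif.format)                  # 'GIF'
print(gif.convert('RGB').format)   # None - converted copies lose their format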

Define the request header, the image storage path, and the download pipeline in the settings.py file

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'

# Root directory for the downloaded images. pipelines.py imports this name and
# concatenates page numbers onto it, so it must end with a slash
# (the exact location is up to you)
IMAGES_STORE = './images/'

# The generated settings.py sets this to True; disable it so the crawl
# is not blocked by the site's robots.txt
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
   'qiumeimei.pipelines.QiumeimeiPipeline': 300,
}
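
One dependency note: Scrapy's ImagesPipeline requires Pillow for the image handling used above, so install it if it is missing:

pip install Pillow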

Run the crawler

scrapy crawl qmm --nolog
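
If nothing is downloaded, run it again without --nolog to see the crawl log and any pipeline errors:

scrapy crawl qmm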

Check that the folders and pictures were downloaded successfully

The .gif files are saved as animated images, as intended.

done.
