Writing a spider file that crawls content in a loop
The Request() method hands the specified url address to the downloader for downloading. It takes two required parameters.
Parameters:
url: the url address to download
callback: the callback function that handles the downloaded page
The request is handed to the downloader with yield Request(), as in the minimal sketch below.
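A minimal, self-contained sketch (the spider name and urls are illustrative, not from this project):

import scrapy
from scrapy.http import Request

class SketchSpider(scrapy.Spider):   # hypothetical spider, for illustration only
    name = 'sketch'
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # url = the address to download, callback = the handler for the response
        yield Request(url='http://blog.jobbole.com/all-posts/page/2/',
                      callback=self.parse_page)

    def parse_page(self, response):
        self.logger.info('downloaded %s', response.url)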
The parse.urljoin() method comes from the urllib library and splices urls automatically: if the url in the second argument is a relative path, it is spliced onto the first argument, while an absolute url passes through unchanged.
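A runnable sketch of that behavior (the paths are illustrative):

from urllib import parse

base = 'http://blog.jobbole.com/all-posts/'
print(parse.urljoin(base, '113834/'))
# relative path is spliced onto the base: http://blog.jobbole.com/all-posts/113834/
print(parse.urljoin(base, '/113834/'))
# a leading slash splices from the domain root: http://blog.jobbole.com/113834/
print(parse.urljoin(base, 'http://example.com/a'))
# an absolute url is returned unchanged: http://example.com/a

The full spider file: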
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request  # Request returns a url to the downloader
from urllib import parse         # import the parse module of the urllib library


class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']               # initial domain name
    start_urls = ['http://blog.jobbole.com/all-posts/']  # start url

    def parse(self, response):
        """Get the article urls on the list page and hand them to the downloader."""
        # Get the article urls on the current page
        lb_url = response.xpath('//a[@class="archive-title"]/@href').extract()
        for i in lb_url:
            # print(parse.urljoin(response.url, i))
            # urljoin() splices a relative url onto response.url automatically
            # Hand each article url to the downloader; parse_wzhang handles the response
            yield Request(url=parse.urljoin(response.url, i), callback=self.parse_wzhang)

        # Get the next-page list url, hand it to the downloader,
        # and loop the response back into parse
        x_lb_url = response.xpath('//a[@class="next page-numbers"]/@href').extract()
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url[0]), callback=self.parse)

    def parse_wzhang(self, response):
        # Get the title of the article
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()
        print(title)
When Request() hands a url to the downloader, it can also pass a custom dictionary to the callback function through its meta attribute.
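A minimal, self-contained sketch of the meta round trip (the spider name and urls are illustrative):

import scrapy
from scrapy.http import Request

class MetaSketchSpider(scrapy.Spider):   # hypothetical spider, for illustration only
    name = 'meta_sketch'
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # The meta dict travels with the request and reappears on the response
        yield Request(url='http://blog.jobbole.com/113834/',
                      meta={'lb_img': 'http://blog.jobbole.com/example.jpg'},
                      callback=self.parse_detail)

    def parse_detail(self, response):
        # .get() with a default avoids a KeyError if the key is missing
        print(response.meta.get('lb_img', ''))

The full crawler file: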
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request  # Request returns a url to the downloader
from urllib import parse         # import the parse module of the urllib library
from adc.items import AdcItem    # import the receiving class from the items module


class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']               # initial domain name
    start_urls = ['http://blog.jobbole.com/all-posts/']  # start url

    def parse(self, response):
        """Get the article urls on the list page and hand them to the downloader."""
        # Get the article list blocks on the current page, css selector
        lb = response.css('div .post.floated-thumb')
        # print(lb)
        for i in lb:
            lb_url = i.css('.archive-title ::attr(href)').extract_first('')  # article url in the block
            # print(lb_url)
            lb_img = i.css('.post-thumb img ::attr(src)').extract_first('')  # article thumbnail in the block
            # print(lb_img)
            # Hand each article url to the downloader; parse_wzhang handles the response
            yield Request(url=parse.urljoin(response.url, lb_url),
                          meta={'lb_img': parse.urljoin(response.url, lb_img)},
                          callback=self.parse_wzhang)

        # Get the next-page list url, hand it to the downloader,
        # and loop the response back into parse
        x_lb_url = response.css('.next.page-numbers ::attr(href)').extract_first('')
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url), callback=self.parse)

    def parse_wzhang(self, response):
        title = response.css('.entry-header h1 ::text').extract()  # article title
        # print(title)
        tp_img = response.meta.get('lb_img', '')  # value passed via meta; .get() avoids a KeyError
        # print(tp_img)
        shjjsh = AdcItem()       # instantiate the data-receiving class
        shjjsh['title'] = title  # hand the data to the fields of the items class
        shjjsh['img'] = tp_img
        yield shjjsh             # return the item to the pipelines.py processing module
Using Scrapy's built-in image downloader
Scrapy provides a built-in image downloader, scrapy.pipelines.images.ImagesPipeline, designed to download images to local disk after the crawler grabs their urls.
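For reference, a minimal sketch of enabling the built-in pipeline directly in settings.py, with no custom subclass (the img field name matches this project's items.py; the save path here is an assumption, and ImagesPipeline requires the Pillow library):

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # stock image downloader
}
IMAGES_URLS_FIELD = 'img'  # item field holding a list of image urls
IMAGES_STORE = 'img'       # directory where downloaded images are saved

The steps below instead subclass ImagesPipeline, so that the saved path of each image can be written back into the item.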
Step 1: after the crawler grabs the image url, it fills it into the container class of the items.py file.
Crawler file:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request  # Request returns a url to the downloader
from urllib import parse         # import the parse module of the urllib library
from adc.items import AdcItem    # import the receiving class from the items module


class PachSpider(scrapy.Spider):
    name = 'pach'
    allowed_domains = ['blog.jobbole.com']               # initial domain name
    start_urls = ['http://blog.jobbole.com/all-posts/']  # start url

    def parse(self, response):
        """Get the article urls on the list page and hand them to the downloader."""
        # Get the article list blocks on the current page, css selector
        lb = response.css('div .post.floated-thumb')
        for i in lb:
            lb_url = i.css('.archive-title ::attr(href)').extract_first('')  # article url in the block
            lb_img = i.css('.post-thumb img ::attr(src)').extract_first('')  # article thumbnail in the block
            # Hand each article url to the downloader; parse_wzhang handles the response
            yield Request(url=parse.urljoin(response.url, lb_url),
                          meta={'lb_img': parse.urljoin(response.url, lb_img)},
                          callback=self.parse_wzhang)

        # Get the next-page list url, hand it to the downloader,
        # and loop the response back into parse
        x_lb_url = response.css('.next.page-numbers ::attr(href)').extract_first('')
        if x_lb_url:
            yield Request(url=parse.urljoin(response.url, x_lb_url), callback=self.parse)

    def parse_wzhang(self, response):
        title = response.css('.entry-header h1 ::text').extract()  # article title
        tp_img = response.meta.get('lb_img', '')  # value passed via meta; .get() avoids a KeyError
        shjjsh = AdcItem()        # instantiate the data-receiving class
        shjjsh['title'] = title   # hand the data to the fields of the items class
        shjjsh['img'] = [tp_img]  # the image downloader expects a list of urls
        yield shjjsh              # return the item to the pipelines.py processing module
Step 2: set up the container class in the items.py file to receive the data filled in by the crawler.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# items.py receives the data extracted by the crawler; it acts as a container file.

class AdcItem(scrapy.Item):    # container class for the data the crawler gets
    title = scrapy.Field()     # receives the article title from the crawler
    img = scrapy.Field()       # receives the thumbnail url
    img_tplj = scrapy.Field()  # image save path
Step 3: use Scrapy's built-in image downloader in pipelines.py.
1. First, import the built-in ImagesPipeline image downloader.
2. Define a custom image downloader that inherits from Scrapy's built-in ImagesPipeline class.
3. Override the item_completed() method of the ImagesPipeline class to get the saved path of each image after downloading (the shape of its results argument is sketched after this list).
4. In the settings.py file, register the custom image downloader class and set the image save path.
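As mentioned in step 3, item_completed() receives a results argument containing one (success, info) tuple per downloaded image; a sketch of its documented shape, with illustrative values:

results = [
    (True, {
        'url': 'http://blog.jobbole.com/example.jpg',          # url the image was downloaded from
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',   # save path, relative to IMAGES_STORE
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',        # md5 checksum of the image
    }),
]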
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline  # import the image downloader module


class AdcPipeline(object):  # data processing class; must inherit object
    def process_item(self, item, spider):
        # process_item() receives the item yielded by the crawler
        print('The article title is: ' + item['title'][0])
        print('The article thumbnail url is: ' + item['img'][0])
        # path filled in by the image downloader after the download finishes
        print('The thumbnail save path is: ' + item['img_tplj'])
        return item


class imgPipeline(ImagesPipeline):  # custom downloader inheriting Scrapy's built-in ImagesPipeline
    def item_completed(self, results, item, info):
        # item_completed() yields the saved path of each downloaded image
        for ok, value in results:
            img_lj = value['path']     # receive the image save path
            item['img_tplj'] = img_lj  # fill the save path into the items.py field
        return item                    # hand the item back to the container class in items.py
Note: after setting up the custom image downloader, register the custom image downloader class in the settings.py file and set the image save path.
IMAGES_URLS_FIELD sets the item field (defined in items.py) that holds the urls of the images to download.
IMAGES_STORE sets the image save path.
import os

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'adc.pipelines.AdcPipeline': 300,  # register adc.pipelines.AdcPipeline; the number is the execution order
    'adc.pipelines.imgPipeline': 1,    # register the custom image downloader; smaller values run first
}

IMAGES_URLS_FIELD = 'img'  # field in items.py that holds the image urls to download
lujin = os.path.abspath(os.path.dirname(__file__))  # directory of settings.py; needs import os above
IMAGES_STORE = os.path.join(lujin, 'img')  # set the path where images are saved
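With both pipelines registered, the spider can be run from the project directory in the usual way:

scrapy crawl pach

Downloaded images then land under the img directory next to settings.py, in a full/ subfolder, with file names derived from a hash of the image url.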