Python crawler project

Keywords: Python, Redis, PyCharm

Python crawler project from scratch (2)

Crawling target: Fangtianxia, a nationwide rental-listing site (start URL: http://zu.fang.com/cities.aspx)

Crawling content: city, name, rental method, price, house type, area, address, transportation

Anti-crawling measures: set a random User-Agent and a request delay

1. Create the project:

scrapy startproject fang

2. Enter the fang folder and run the command below to generate the spider file; the crawler code will be written in it.

scrapy genspider zufang "zu.fang.com"

After the command finishes, open the project directory with PyCharm.
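At this point the project layout looks roughly like this (a sketch of the standard structure that scrapy startproject and genspider produce with the names used above; the later listings use the package name homepro, which suggests the author's actual project name differed):

fang/
    scrapy.cfg
    fang/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zufang.py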

3. Edit items.py in the project directory and define the fields you want to crawl.

import scrapy


class HomeproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    city = scrapy.Field()       # city
    title = scrapy.Field()      # listing name
    rentway = scrapy.Field()    # rental method
    price = scrapy.Field()      # price
    housetype = scrapy.Field()  # house type (layout)
    area = scrapy.Field()       # floor area
    address = scrapy.Field()    # address
    traffic = scrapy.Field()    # transportation

4. Enter the spiders folder, open the generated spider file, and write the crawler:

# -*- coding: utf-8 -*-
import scrapy
from homepro.items import HomeproItem
from scrapy_redis.spiders import RedisCrawlSpider
# scrapy.Spider
class HomeSpider(RedisCrawlSpider):
    name = 'home'
    allowed_domains = ['zu.fang.com']
    # start_urls = ['http://zu.fang.com/cities.aspx']

    # start URLs are pushed to this Redis key instead of being hard-coded
    redis_key = 'homespider:start_urls'

    def parse(self, response):
        # collect the link to every city's rental page
        hrefs = response.xpath('//div[@class="onCont"]/ul/li/a/@href').extract()
        for href in hrefs:
            href = 'http:' + href
            yield scrapy.Request(url=href, callback=self.parse_city, dont_filter=True)

    def parse_city(self, response):
        # the span text reads "共N页" (N pages in total); strip the surrounding characters
        page_num = response.xpath('//div[@id="rentid_D10_01"]/span[@class="txt"]/text()').extract()[0].strip('共页')
        # print('*' * 100)
        # print(page_num)
        # print(response.url)

        for page in range(1, int(page_num)):
            if page == 1:
                url = response.url
            else:
                url = response.url + 'house/i%d' % (page + 30)
            print('*' * 100)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse_houseinfo, dont_filter=True)

    def parse_houseinfo(self, response):
        divs = response.xpath('//dd[@class="info rel"]')
        for info in divs:
            # the breadcrumb link reads "XX租房"; strip the trailing "租房" (renting) to keep the city name
            city = info.xpath('//div[@class="guide rel"]/a[2]/text()').extract()[0].rstrip('租房')
            title = info.xpath('.//p[@class="title"]/a/text()').extract()[0]
            rentway = info.xpath('.//p[@class="font15 mt12 bold"]/text()')[0].extract().replace(' ', '').lstrip('\r\n')
            housetype = info.xpath('.//p[@class="font15 mt12 bold"]/text()')[1].extract().replace(' ', '')
            area = info.xpath('.//p[@class="font15 mt12 bold"]/text()')[2].extract().replace(' ', '')
            addresses = info.xpath('.//p[@class="gray6 mt12"]//span/text()').extract()
            address = '-'.join(i for i in addresses)
            try:
                des = info.xpath('.//p[@class="mt12"]//span/text()').extract()
                traffic = '-'.join(i for i in des)
            except Exception as e:
                traffic = 'No details'

            p_name = info.xpath('.//div[@class="moreInfo"]/p/text()').extract()[0]
            p_price = info.xpath('.//div[@class="moreInfo"]/p/span/text()').extract()[0]
            price = p_price + p_name

            item = HomeproItem()
            item['city'] = city
            item['title'] = title
            item['rentway'] = rentway
            item['price'] = price
            item['housetype'] = housetype
            item['area'] = area
            item['address'] = address
            item['traffic'] = traffic
            yield item
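The intro lists a random User-Agent as one of the anti-crawling measures, but the original post does not show it. A minimal sketch of such a downloader middleware (in middlewares.py; the class name RandomUserAgentMiddleware and the UA list are illustrative, not from the original) could look like this, to be enabled in the settings of the next step:

# middlewares.py -- sketch of a random User-Agent downloader middleware
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0',
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a different User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)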

5. Edit settings.py and configure the scrapy-redis related settings:

# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Use the scrapy-redis duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Queue class used to schedule requests.
# Default is priority order (as in plain Scrapy), backed by a Redis sorted set;
# FIFO and LIFO queues are also available.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'

REDIS_HOST = '10.8.153.73'
REDIS_PORT = 6379

# Whether to keep the scheduler queue and dupefilter records on shutdown:
# True = keep, False = clear
SCHEDULER_PERSIST = True
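The request delay from the intro, the middleware sketched above, and an item pipeline are not in the original listing; a hedged sketch of the extra settings might be (the middleware path assumes the project package is named homepro, matching the spider's import):

# Request delay in seconds (anti-crawling measure from the intro)
DOWNLOAD_DELAY = 1

# Enable the random User-Agent middleware sketched above
DOWNLOADER_MIDDLEWARES = {
    'homepro.middlewares.RandomUserAgentMiddleware': 543,
}

# Store crawled items in Redis via scrapy-redis's pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}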

6. Then copy the code to the other worker machines and start them; each worker connects to the master server's Redis:

redis-cli -h <master server ip>
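The original does not show starting the spider process itself on each worker; with the spider name defined above, each worker would run:

scrapy crawl home

The spider then idles, waiting for a start URL to appear under the homespider:start_urls key.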

7. On the master server, start redis-server first, then open redis-cli and push the start URL:

lpush homespider:start_urls http://zu.fang.com/cities.aspx
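If the RedisPipeline from the settings sketch above is enabled, crawled items are pushed as JSON strings into a Redis list named after the spider (here home:items by default). A quick way to inspect them from the master, assuming the redis-py package:

import json
import redis

# host/port must match REDIS_HOST / REDIS_PORT in settings.py
r = redis.Redis(host='10.8.153.73', port=6379)
for raw in r.lrange('home:items', 0, -1):
    print(json.loads(raw))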
