An IP proxy pool built with the Python Scrapy crawler framework

Keywords: Attribute Lambda

1. http://www.xicidaili.com/wt — a domestic website listing free proxies

2. Crawl the IP addresses and ports from the website with Scrapy and write them into a txt file

3. Write a script to test whether the ip address and port in the txt document are available

4. Write the available IP addresses and ports into another txt document

------------
1. Write Item class
Since we only need ip address and port, we can write only one attribute

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class IpItem(scrapy.Item):
    # Unused boilerplate item generated by `scrapy startproject`; this
    # project stores its data in IpInfoItem below instead.
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class IpInfoItem(scrapy.Item):
    # Single field holding one proxy as an "ip:port" string.
    ip=scrapy.Field()

2. Write spider

# -*- coding: utf-8 -*-
import scrapy
import sys
sys.path.append("D:\\pycodes\\ip")
from ip.items import IpInfoItem

class IpSpider(scrapy.Spider):
    """Spider that scrapes proxy "ip:port" pairs from xicidaili.com."""
    name = 'Ip'

    start_urls = []
    # Crawl the first 5 listing pages of the site.
    for i in range(1, 6):
        start_urls.append('http://www.xicidaili.com/wt/' + str(i))

    def parse(self, response):
        """Yield one IpInfoItem per table row that carries an IP and a port.

        Fixes vs. the original: a fresh item is created for every row
        (the original mutated and re-yielded a single shared instance, so
        downstream consumers could all see the last row's value), and rows
        without td[2]/td[3] -- e.g. the table header -- are skipped instead
        of yielding the literal string "None:None".
        """
        for sel in response.xpath('//tr'):
            ip = sel.xpath('.//td[2]/text()').extract_first()
            port = sel.xpath('.//td[3]/text()').extract_first()
            if ip is None or port is None:
                continue  # header/separator rows have no IP cells
            item = IpInfoItem()
            item['ip'] = str(ip) + ":" + str(port)
            yield item

3. Write pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IpPipeline(object):
    # Default pass-through pipeline generated by `scrapy startproject`;
    # it performs no processing. IpInfoPipeline does the real work.
    def process_item(self, item, spider):
        """Return the item unchanged (no-op)."""
        return item


class IpInfoPipeline(object):
    """Pipeline that appends each scraped "ip:port" string to xinresult.txt."""

    def process_item(self, item, spider):
        """Persist item['ip'] and pass the item on to the next pipeline.

        Fixes vs. the original: the file handle is closed deterministically
        via ``with`` (the original leaked it), and only the expected failures
        are swallowed -- KeyError (item without an 'ip' field) and OSError
        (write failure, keeping the original best-effort behaviour) --
        instead of a bare ``except`` that would also hide genuine bugs.
        """
        try:
            # We only need the IP address and port, so we only write the
            # dictionary value into the txt file.
            content = item['ip']
            with open("xinresult.txt", "a") as f:
                f.write(content + "\n")
        except (KeyError, OSError):
            pass
        return item

So far, we have crawled 5 pages of IPs from the website; now we need to write a script to test them

import requests

# Module-level accumulator: test_alive() appends working proxies here.
alive_ip=[]

def test_alive(proxy):
    """Probe each proxy in *proxy* (an iterable of "ip:port" strings)
    against baidu.com and collect the working ones in the global
    ``alive_ip`` list.
    """
    global alive_ip
    for proxies_be in proxy:
        # requests expects a scheme -> proxy mapping.
        proxies = {"http": proxies_be}

        print("Testing:{}".format(proxies))
        try:
            r = requests.get("http://www.baidu.com", proxies=proxies, timeout=2)
            if r.status_code == 200:
                print("Success,ip by{}".format(proxies))
                alive_ip.append(proxies_be)
            else:
                print("fail")
        # Catch only network-level errors (timeouts, connection/proxy
        # failures); the original bare ``except`` would also have hidden
        # genuine bugs such as NameError or KeyboardInterrupt.
        except requests.RequestException:
            print("fail")


def out_file(alive_ip=None):
    """Write each proxy in *alive_ip* to alive_ip.txt, one per line.

    The original used a mutable default argument (``[]``), a classic
    Python pitfall (the list is shared across calls); ``None`` is the
    safe sentinel and preserves the no-argument behaviour exactly.
    """
    if alive_ip is None:
        alive_ip = []
    with open("alive_ip.txt", "w") as f:
        for ip in alive_ip:
            f.write(str(ip) + "\n")
        print("Output complete")

def test(filename="blank.txt"):
    """Load "ip:port" lines from *filename*, probe each one via
    test_alive(), then dump the live proxies with out_file().
    """
    with open(filename, "r") as source:
        candidates = [line.strip() for line in source]
        test_alive(candidates)

    out_file(alive_ip)


# Guard the script entry point so importing this module does not trigger
# a network-heavy proxy test run.
if __name__ == "__main__":
    test("xinresult.txt")

Posted by jtmathome on Thu, 30 Apr 2020 19:48:04 -0700