1. http://www.xicidaili.com/wt is a domestic (Chinese) free proxy website.
2. Use Scrapy to crawl the IP addresses and ports listed on the site and write them into a txt file.
3. Write a script that tests whether each IP address and port in the txt file is usable.
4. Write the usable IP addresses and ports into another txt file.
------------
1. Write the Item class
(The steps below assume a Scrapy project has already been created, e.g. with scrapy startproject ip, matching the import paths used later.)
Since we only need the IP address and port, a single field is enough.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class IpItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class IpInfoItem(scrapy.Item):
    ip = scrapy.Field()
2. Write the spider
# -*- coding: utf-8 -*-
import scrapy
import sys
sys.path.append("D:\\pycodes\\ip")  # make the project package importable
from ip.items import IpInfoItem


class IpSpider(scrapy.Spider):
    name = 'Ip'
    start_urls = []
    # Crawl the first 5 pages of the site
    for i in range(1, 6):
        start_urls.append('http://www.xicidaili.com/wt/' + str(i))

    def parse(self, response):
        for sel in response.xpath('//tr'):
            ip = sel.xpath('.//td[2]/text()').extract_first()
            port = sel.xpath('.//td[3]/text()').extract_first()
            # The header row has <th> cells only, so both lookups return None there
            if ip is None or port is None:
                continue
            # Create a fresh item per row instead of reusing one object
            item = IpInfoItem()
            item['ip'] = ip + ":" + port
            yield item
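With name = 'Ip', the spider is normally launched from the project root with the scrapy crawl Ip command. It can also be driven from a plain Python script via Scrapy's CrawlerProcess; a minimal sketch, assuming it is run from the project root so that get_project_settings() can locate the project:
# run_spider.py -- hypothetical helper script, placed in the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('Ip')  # look the spider up by its name attribute
process.start()      # blocks until the crawl finishes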
3. Write the pipeline
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IpPipeline(object):
    def process_item(self, item, spider):
        return item


class IpInfoPipeline(object):
    def process_item(self, item, spider):
        try:
            # We only need the IP address and port, so we write just the
            # dictionary value into the txt file
            content = item['ip']
            with open("xinresult.txt", "a") as f:
                f.write(content + "\n")
        except KeyError:
            pass
        return item
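As the boilerplate comment above says, IpInfoPipeline only receives items once it is enabled in the project's settings.py; a minimal sketch, assuming the project is named ip as in the spider's import path:
# settings.py -- enable the pipeline (project name 'ip' assumed)
ITEM_PIPELINES = {
    'ip.pipelines.IpInfoPipeline': 300,
}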
So far we have crawled 5 pages of IPs from the website; next we need a script to test which of them actually work.
import requests

alive_ip = []


def test_alive(proxy):
    global alive_ip
    for proxies_be in proxy:
        # requests expects the proxy as a dict of the form below
        proxies = {"http": proxies_be}
        print("Testing: {}".format(proxies))
        try:
            r = requests.get("http://www.baidu.com", proxies=proxies, timeout=2)
            if r.status_code == 200:
                print("Success, ip: {}".format(proxies))
                alive_ip.append(proxies_be)
            else:
                print("fail")
        except requests.exceptions.RequestException:
            print("fail")


def out_file(alive_ip):
    with open("alive_ip.txt", "w") as f:
        for ip in alive_ip:
            f.write(str(ip) + "\n")
    print("Output complete")


def test(filename="blank.txt"):
    with open(filename, "r") as f:
        lines = f.readlines()
    proxys = list(map(lambda x: x.strip(), lines))
    test_alive(proxys)
    out_file(alive_ip)


test("xinresult.txt")