Python 3 Web Crawler (15): Proxies

Keywords: Python socket Selenium Redis github

 Infi-chu:

http://www.cnblogs.com/Infi-chu/

1. Setting up a proxy

1.urllib

# HTTP proxy: send urllib requests through an HTTP proxy.
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy = '127.0.0.1:9743'
# For a proxy that requires authentication, put the credentials in front
# of the host:
# proxy = 'username:password@127.0.0.1:9743'

# The dict key is the scheme of the URL being requested; the value is the
# proxy URL used for requests of that scheme.
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)
try:
    res = opener.open('http://httpbin.org/get')
    print(res.read().decode('utf-8'))
except URLError as e:
    # Raised, for example, when the proxy cannot be reached.
    print(e.reason)
# SOCKS5 proxy: send urllib requests through a SOCKS5 proxy.
import socket
from urllib import request
from urllib.error import URLError

import socks  # pip3 install PySocks

# Replace the socket class for the whole process so every connection made
# afterwards (including those opened by urllib) goes through the proxy.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    res = request.urlopen('http://httpbin.org/get')
    print(res.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

2.requests
Setting a proxy with requests is simpler than with urllib.

# HTTP proxy: pass a per-scheme proxy mapping to requests.
import requests

proxy = '127.0.0.1:9743'
# The key selects the proxy by the scheme of the *target* URL; the scheme in
# the value says how to talk to the proxy itself. This is a plain HTTP proxy,
# so both entries use http:// (an https:// proxy URL would make requests
# attempt a TLS handshake with the proxy).
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

# SOCKS5 proxy (1): give requests a socks5:// proxy URL.
import requests  # pip3 install 'requests[socks]'

proxy = '127.0.0.1:9742'
# NOTE: with socks5:// host names are resolved locally; use socks5h:// if
# DNS resolution should happen on the proxy instead.
proxies = {
    'http': 'socks5://' + proxy,
    'https': 'socks5://' + proxy,
}
try:
    res = requests.get('http://httpbin.org/get', proxies=proxies)
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)
# SOCKS5 proxy (2): patch the socket module so all traffic uses the proxy.
import socket

import requests
import socks  # pip3 install PySocks

# Every socket created after this point goes through the SOCKS5 proxy, so
# requests needs no `proxies` argument here.
socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
socket.socket = socks.socksocket
try:
    res = requests.get('http://httpbin.org/get')
    print(res.text)
except requests.exceptions.ConnectionError as e:
    print('Error', e.args)

3.Selenium
Set up browser proxy

# Launch Chrome with all of its traffic routed through an HTTP proxy.
from selenium import webdriver

proxy = '127.0.0.1:9743'
# Chrome takes its proxy from a command-line switch, set via ChromeOptions.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=http://' + proxy)
# `options=` replaces the deprecated `chrome_options=` keyword, which was
# removed in Selenium 4.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')

Set up an authenticated proxy (one that requires a username and password)

# Launch Chrome through a proxy that requires a username and password.
# Chrome has no command-line switch for proxy credentials, so a small
# extension is generated on the fly: it configures the proxy and answers
# the proxy's authentication challenge.
import zipfile

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

ip = '127.0.0.1'
port = 9743
username = 'test'
password = 'test'

# Extension manifest.
# NOTE(review): this is a Manifest V2 extension; recent Chrome releases are
# phasing out Manifest V2 -- confirm against the Chrome version in use.
manifest_json = """
{
	"version":"1.0.0",
	"manifest_version":2,
	"name":"Chrome Proxy",
	"permissions":[
	"proxy",
	"tabs",
	"unlimitedStorage",
	"storage",
	"<all_urls>",
	"webRequest",
	"webRequestBlocking"
	],
	"background":{"scripts":["background.js"]}
}
"""

# Background script: sets a fixed proxy server and supplies the credentials
# whenever an authentication challenge is raised. The port is substituted
# without quotes so that the script receives a number rather than a string.
background_js = """
var config={
	mode:"fixed_servers",
	rules:{
		singleProxy:{
			scheme:"http",
			host:"%(ip)s",
			port:%(port)s
		}
	}
}

chrome.proxy.settings.set({value:config,scope:"regular"},function(){});
function callbackFn(details){
	return{
		authCredentials:{
			username:"%(username)s",
			password:"%(password)s"
		}
	}
}
chrome.webRequest.onAuthRequired.addListener(
	callbackFn,
	{urls:["<all_urls>"]},
	['blocking']
)
""" % {'ip': ip, 'port': port, 'username': username, 'password': password}

# Pack the two files into a zip archive that Chrome loads as an extension.
# The manifest must be stored under the exact name "manifest.json".
plugin_file = 'proxy_auth_plugin.zip'
with zipfile.ZipFile(plugin_file, 'w') as zp:
    zp.writestr("manifest.json", manifest_json)
    zp.writestr("background.js", background_js)

chrome_options = Options()
chrome_options.add_argument('--start-maximized')
chrome_options.add_extension(plugin_file)
# `options=` replaces the deprecated `chrome_options=` keyword, which was
# removed in Selenium 4.
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://httpbin.org/get')

2. Maintaining a proxy pool
A single proxy is not enough for a crawling job, so we need many proxies working for us.
We filter the proxies we collect and keep only the usable ones, so the pool can serve requests efficiently.
1. Preparation
You need a Redis database and the aiohttp, requests, redis-py, pyquery, and Flask libraries.
2. Components of the proxy pool: storage module, acquisition (getter) module, detection (tester) module, and interface (API) module
3. Implementation of each module:

https://github.com/Infi-chu/proxypool

3. Using proxies to crawl WeChat articles

https://github.com/Infi-chu/weixinspider

Posted by minus4 on Fri, 20 Mar 2020 08:45:12 -0700