. As shown in the red box below. It mainly uses Python's BeautifulSoup and selenium, together with a cloud captcha-solving ("cloud coding") platform and Baidu OCR. The captcha-solving platform is needed because 360 asks for a verification code after frequent queries from a fixed IP; the service is paid, at about 1 cent per code. Baidu OCR is needed because 360 returns the company name as a picture, so character recognition is required; fewer than 50,000 recognitions per day are free of charge. Each number takes about 4-8 seconds to process, and only a single process is used (multiple processes are pointless, since there is only one fixed IP). We processed 9000 numbers, starting at 0 o'clock and finishing at about 10 o'clock. (python3.7.2)
Cloud coding platform: http://www.yundama.com/apidoc/YDM_SDK.html#demo
Baidu OCR: https://ai.baidu.com/sdk#ocr
1. Modules to be used
# -*- coding: UTF-8 -*-
import base64
import datetime
import os
import random
import re
import sys
import time
from ctypes import *

# Baidu OCR module
from aip import AipOcr
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

default_encoding = 'utf-8'
# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there, so the hack is explicitly restricted to Python 2 instead of
# relying on the encoding comparison alone to keep it from running.
if sys.version_info[0] < 3 and sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
Preparatory work
# --- Cloud captcha-solving (Yundama) API configuration ----------------------
# The API needs an account id, an API key, a user name, a password, a
# recognition type and a timeout.
# Pay attention to the path of the Yundama DLL file.
YDMApi = windll.LoadLibrary('C:\\phone\\yundamaAPI-x64.dll')
# TODO: replace 0 with your own numeric account id. The original text had an
# unquoted placeholder here, which is not valid Python.
appId = 0
appKey = b'api key'
username = b'User name'
password = b'Password'
# 1004 indicates that the recognition type is 4 letters or digits.
codetype = 1004
# Timeout handed to the captcha-solving call.
# NOTE(review): presumably seconds - confirm against the SDK docs.
timeout = 60

# --- Selenium browser --------------------------------------------------------
chrome_options = webdriver.ChromeOptions()
# Run without the sandbox ("highest authority" mode) and without a GUI.
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)
# Window maximization; has no visible effect in headless mode.
browser.maximize_window()

# Open a 360 search result page first so the search box is available.
url = 'https://www.so.com/s?q=021'
browser.get(url)
# Query one phone number on 360 search, solving the captcha if one is shown,
# and leave the resulting page markup in `data`.
phone = number

# The short sleeps give the page time to react between actions; they can be
# tuned as needed.
try:
    # Locate the search box, empty it, type the number and submit the query.
    sousuokuang = browser.find_element_by_id("keyword")
    time.sleep(0.5)
    sousuokuang.clear()
    time.sleep(0.5)
    sousuokuang.send_keys(phone)
    time.sleep(0.5)
    browser.find_element_by_id("su").submit()
    time.sleep(random.uniform(0.5, 1.3))
    # Probe for the captcha image. If it cannot be located we fall into the
    # except branch (no captcha); if it is found we continue in the else
    # branch.
    yanzhengma = browser.find_element_by_id("img")
except Exception:
    # No captcha was found (or the search step failed): best effort, carry on
    # and parse whatever page is currently loaded. `Exception` rather than a
    # bare except so Ctrl-C / SystemExit are not swallowed.
    pass
else:
    captcha_path = "c:\\phone\\%s.png" % phone
    # The captcha picture has to be clicked once before the code is displayed.
    time.sleep(0.3)
    ActionChains(browser).click(yanzhengma).perform()
    time.sleep(0.3)
    # Save the captcha picture locally as <number>.png.
    yanzhengma.screenshot(captcha_path)
    try:
        # Writable output buffer for the DLL. A c_char_p wrapping a bytes
        # literal points at immutable memory and must not be used as an
        # output buffer.
        # NOTE(review): 30 bytes mirrors the vendor demo - confirm against the
        # SDK documentation.
        result = create_string_buffer(30)
        filename = b'C:\\phone\\%s.png' % phone.encode('gbk')
        captchaId = YDMApi.YDM_EasyDecodeByPath(
            username, password, appId, appKey,
            filename, codetype, timeout, result)
        # Decode the recognised captcha text.
        shuruma = result.value.decode('gbk')
        # Locate the captcha input box, type the code and submit it.
        shurukuang = browser.find_element_by_name("rcode")
        time.sleep(0.3)
        shurukuang.send_keys(shuruma)
        time.sleep(0.3)
        browser.find_element_by_class_name("btn").submit()
    finally:
        # Always delete the captcha picture, even when solving failed.
        if os.path.exists(captcha_path):
            os.remove(captcha_path)
finally:
    # Read the page content and normalise it through PyQuery.
    html = browser.page_source
    data = str(pq(html))
def get_file_content(filePath):
    """Return the raw bytes of the picture stored at *filePath*.

    Used to load pictures that need OCR recognition.
    """
    with open(filePath, 'rb') as image_file:
        raw_bytes = image_file.read()
    return raw_bytes


# Baidu OCR credentials (app id, API key, secret key) and the client object
# built from them; fill in your own values.
APP_ID = 'appid'
API_KEY = 'API key'
SECRET_KEY = 'Secret key'
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
# Page information analysis: extract the tag attached to the number (`state`,
# e.g. harassing phone, intermediary) and how many times it was tagged (`num`).


def _clean_label(text):
    """Return *text* with tabs, newlines and spaces removed."""
    return text.replace('\t', '').replace('\n', '').replace(' ', '')


def _read_state_and_num(spans):
    """Return ``(state, num)`` read from a non-empty list of <span> tags.

    The first span holds the tag text. The second span, when present, holds
    the tag count inside a <b> element; if it is missing the count is u'0'.
    """
    state = _clean_label(spans[0].getText())
    try:
        num = spans[1].find('b').getText()
    except (IndexError, AttributeError):
        # Only one span, or no <b> inside the second span: count unavailable.
        # Always a string so `num` has one type on every path (the original
        # mixed int 0 and u'0').
        num = u'0'
    return state, num


soup = BeautifulSoup(data, "lxml")

# Normal case: a div with class "mohe-tips" that contains exactly two spans.
soup_div = soup.find('div', {'class': "mohe-tips"})
soup_spans = soup_div.findAll('span') if soup_div is not None else []

# Special cases: that div is missing or does not hold two spans, so look for
# the div whose class is "mohe-tips mh-ws-hy" instead.
if len(soup_spans) != 2:
    soup_div = soup.find('div', {'class': "mohe-tips mh-ws-hy"})
    soup_spans = soup_div.findAll('span') if soup_div is not None else []

if soup_spans:
    state, num = _read_state_and_num(soup_spans)
else:
    # Nothing found: the number is unmarked.
    num = u'0'
    state = u'nothing'
# Company name: 360 renders it as an inline base64 PNG, so it is read by OCR.


def _recognise_company(img_tag):
    """Return the company name recognised from an inline-PNG <img> tag.

    Returns u'nothing' when *img_tag* is None, has no ``src`` attribute, or
    the OCR response contains no recognised words. A response without a
    'words_result' key still raises KeyError so service errors stay visible.
    """
    try:
        # Strip the data-URI prefix to get the bare base64 payload.
        img_src = img_tag.get("src").replace('data:image/png;base64,', '')
    except AttributeError:
        # No img tag, or no src attribute: there is no company label.
        return u'nothing'
    # Hand the decoded bytes straight to the OCR client. The original wrote
    # them to a temporary file and read the same bytes back, leaving the file
    # behind whenever the OCR call failed.
    image = base64.b64decode(img_src)
    words_result = client.basicGeneral(image)['words_result']
    if not words_result:
        return u'nothing'
    return words_result[0]['words']


# Look for an img with class "mh-hy-img" first.
soup_img = soup.find('img', {'class': "mh-hy-img"})
if soup_img is None:
    # Special case: the picture may sit inside a strong tag whose class is
    # "mohe-tips mh-hy".
    soup_strong = soup.find('strong', {'class': "mohe-tips mh-hy"})
    if soup_strong is not None:
        soup_img = soup_strong.find('img')

company = _recognise_company(soup_img)
8. Output result
# Output the number, its tag, the tag count and the company name separated by
# single spaces. Uses the print() function: the Python 2 print statement is a
# SyntaxError on Python 3.
print(phone, state, num, company)