selenium + cloud calling code + Baidu ocr crawling 360's phone number mark

Keywords: Selenium Python less SDK

This script crawls the mark that 360 Search displays for a phone number, as shown in the red box below. It mainly uses Python's BeautifulSoup and Selenium, together with the cloud coding (Yundama captcha-solving) platform (after frequent queries from a fixed IP, 360 requires a verification code to be entered; solving it is a paid service costing 1 cent per code) and Baidu OCR (in 360's query results the company name is rendered as a picture, so character recognition is needed; fewer than 50,000 calls per day are free of charge). Each number takes about 4-8 seconds to process, using a single process only (multiprocessing is pointless, since there is only one fixed IP). We processed 9,000 numbers, starting at about 0:00 and finishing at about 10:00. (Python 3.7.2)

Cloud coding platform: http://www.yundama.com/apidoc/YDM_SDK.html#demo
Baidu OCR: https://ai.baidu.com/sdk#ocr

1. Modules to be used

#-*- coding: UTF-8 -*-
import sys
import time
import os
import re
import random
import base64
# Baidu OCR module
from aip import AipOcr
import datetime
from ctypes import *
from selenium import webdriver
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Legacy Python 2 shim that forces the interpreter-wide default string
# encoding to UTF-8. Python 3 always reports 'utf-8' and provides neither the
# `reload` builtin nor `sys.setdefaultencoding`, so the branch is guarded by an
# explicit version check rather than relying on the comparison alone.
default_encoding = 'utf-8'
if sys.version_info[0] == 2 and sys.getdefaultencoding() != default_encoding:
    # Python 2 only: reloading `sys` makes `setdefaultencoding` available again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

Preparatory work

# Configuration for the cloud coding (captcha-solving) API. A call needs the
# account id, api key, user name, password, identification type and timeout.
# Every credential below is a placeholder and must be replaced with real values.
# Pay attention to specifying the dll file path of cloud coding; `windll` is
# only available on Windows.
YDMApi = windll.LoadLibrary('C:\\phone\\yundamaAPI-x64.dll')
# Numeric account id (placeholder; the original article left this as prose,
# which is not valid Python).
appId = 0
appKey = b'api key'
username = b'User name'
password = b'Password'
# 1004 indicates that the recognition type is 4 letters or numbers
codetype = 1004
# Passed as the timeout argument of the decode call (presumably seconds --
# confirm against the SDK documentation).
timeout = 60

# Start the Chrome session that every later step drives through `browser`.
chrome_options = webdriver.ChromeOptions()
# --no-sandbox: run Chrome without its sandbox (the article calls this the
# "highest authority mode"); --headless: run without a graphical interface.
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--headless')
# `options=` is the supported keyword; the older `chrome_options=` spelling is
# deprecated and rejected by newer Selenium releases.
browser = webdriver.Chrome(options=chrome_options)

# Window maximization; has no practical effect in the non graphical mode.
browser.maximize_window()

# Open a 360 query page first so that the search box is present for the
# real queries that follow.
url = 'https://www.so.com/s?q=021'
browser.get(url)

# Selenium names used only by this step (selenium is already a dependency of
# this script).
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# `number` is the phone number to query, as a str; it has to be supplied
# before this step runs.
phone = number
# Temporary file used to hand the captcha screenshot to the cloud coding DLL.
captcha_path = 'c:\\phone\\%s.png' % phone

# Some of the sleeps in the middle are there to keep the operation error free;
# they can be adjusted appropriately.
try:
    # Locate the search box, empty it and type the number.
    sousuokuang = browser.find_element(By.ID, "keyword")
    time.sleep(0.5)
    sousuokuang.clear()
    time.sleep(0.5)
    sousuokuang.send_keys(phone)
    time.sleep(0.5)
    # Click the search button
    browser.find_element(By.ID, "su").submit()
    time.sleep(random.uniform(0.5, 1.3))
    # Try to locate the verification code control. A failed lookup means no
    # captcha was shown (handled by `except`); success continues in `else`.
    yanzhengma = browser.find_element(By.ID, "img")
except NoSuchElementException:
    # No verification code, query succeeded, the browser is on the result page.
    # Only "element not found" is treated this way, so that interrupts and
    # real driver failures are no longer silently swallowed.
    pass
else:
    try:
        # The picture needs to be clicked once before the code is displayed.
        time.sleep(0.3)
        ActionChains(browser).click(yanzhengma).perform()
        time.sleep(0.3)
        # Save the captcha picture locally (<number>.png)
        yanzhengma.screenshot(captcha_path)
        # Conduct cloud coding (refer to the cloud coding document). The DLL
        # writes the recognised text into `result`, so it must be a mutable
        # buffer rather than a pointer to an immutable bytes object.
        result = create_string_buffer(b"                              ")
        filename = captcha_path.encode('gbk')
        captchaId = YDMApi.YDM_EasyDecodeByPath(username, password, appId, appKey, filename, codetype, timeout, result)

        # Decode the recognised verification code
        shuruma = result.value.decode('gbk')
        # Locate the verification code input box
        shurukuang = browser.find_element(By.NAME, "rcode")
        time.sleep(0.3)
        # Enter the verification code and click the button
        shurukuang.send_keys(shuruma)
        time.sleep(0.3)
        browser.find_element(By.CLASS_NAME, "btn").submit()
    finally:
        # Delete the captcha picture even when one of the steps above failed.
        if os.path.exists(captcha_path):
            os.remove(captcha_path)
finally:
    # Read the page content; `data` is the markup parsed by the later steps.
    html = browser.page_source
    data = str(pq(html))

# Read pictures that need ocr recognition
def get_file_content(filePath):
    """Return the complete contents of the file at *filePath* as bytes."""
    with open(filePath, mode='rb') as image_file:
        content = image_file.read()
    return content

# Baidu OCR credentials: app id, API key and secret key. All three values are
# placeholders and must be replaced with the ones issued for your own account.
APP_ID = 'appid'
API_KEY = 'API key'
SECRET_KEY = 'Secret key'
# OCR client built from the credentials above; the company-name recognition
# step calls `client.basicGeneral` on it.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

def _parse_mark(spans):
    """Return ``(state, num)`` read from the <span> tags of a tips box.

    The first span holds the number's tag (harassing phone, intermediary,
    etc.); its text is returned with tabs, newlines and spaces removed. The
    second span holds the number of people who tagged it inside a <b>
    element; when that span or its <b> is missing, the count is ``u'0'``.

    *spans* must contain at least one element.
    """
    state = spans[0].get_text().replace('\t', '').replace('\n', '').replace(' ', '')
    try:
        num = spans[1].find('b').get_text()
    except (IndexError, AttributeError):
        # No second span, or no <b> inside it: no count is available.
        num = u'0'
    return state, num


# Page information analysis
soup = BeautifulSoup(data, "lxml")

# Result for an unmarked number; overwritten below when a mark is found.
state = u'nothing'
num = u'0'

# Normal case: a div with class mohe-tips holding exactly 2 spans.
soup_div = soup.find('div', {'class': "mohe-tips"})
soup_spans = soup_div.find_all('span') if soup_div is not None else []
if len(soup_spans) != 2:
    # Special cases (no mohe-tips div, or one without exactly 2 spans):
    # look for the div whose class is "mohe-tips mh-ws-hy" instead.
    soup_div = soup.find('div', {'class': "mohe-tips mh-ws-hy"})
    soup_spans = soup_div.find_all('span') if soup_div is not None else []

# With no usable spans the number is unmarked and the defaults stand.
if soup_spans:
    state, num = _parse_mark(soup_spans)

def _ocr_company(img_tag):
    """Return the company name recognised from an inline base64 PNG <img> tag.

    Returns ``u'nothing'`` when *img_tag* is None, has no ``src`` attribute,
    or the OCR response contains no recognised text. A response without a
    'words_result' key still raises KeyError, as it did before.
    """
    src = img_tag.get("src") if img_tag is not None else None
    if not src:
        return u'nothing'
    # Strip the data-URI prefix and decode the picture. The bytes are passed
    # to the OCR client directly; writing them to a file and reading them
    # back would yield the same bytes and could leave the file behind if the
    # recognition failed.
    image = base64.b64decode(src.replace('data:image/png;base64,', ''))
    words = client.basicGeneral(image)['words_result']
    if not words:
        return u'nothing'
    return words[0]['words']


# Find if there is an img control with class name mh-hy-img
soup_img = soup.find('img', {'class': "mh-hy-img"})
if soup_img is None:
    # Special case: the picture may instead sit inside a strong control whose
    # class is "mohe-tips mh-hy".
    soup_strong = soup.find('strong', {'class': "mohe-tips mh-hy"})
    if soup_strong is not None:
        soup_img = soup_strong.find('img')

# Company tag of the number, or u'nothing' when the page shows none.
company = _ocr_company(soup_img)

8. Output result

# Output the queried number, its tag, the number of tags and the company name.
# Uses the print() function because the script targets Python 3.
print(phone, state, num, company)

Posted by graphic3 on Sun, 10 Nov 2019 08:11:31 -0800

Programmer Group

selenium + cloud calling code + Baidu ocr crawling 360's phone number mark

1. Modules to be used

Preparatory work

8. Output result

Hot Keywords