Reading designated emails from a Tencent enterprise mailbox (Exmail) with Python 3 + Scrapy + Selenium

Keywords: Selenium JSON Django Javascript

1. Recently my manager got fed up with email, probably because there was simply too much of it. He wanted to read the messages in batches, extract the required data and send it to his own interface. Tencent's JavaScript, however, is too large and too hard to read and understand (tongue in cheek), so I am going to use Selenium to simulate a browser instead.

2. Because the crawl has to run on a schedule every day, the Scrapy framework is used here. Its main purpose in this project is really just convenient scheduling: you simply deploy the spider. We already had a scheduling setup built around Scrapy, so we reused it. If you do not need scheduled runs, use whatever approach you like.

3. Import some libraries first. Some of them appear to be unused; if your editor shows an import greyed out, just remove it or comment it out.

import json
import re
import time

import requests
import scrapy
from django.http import request, HttpResponse
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions, DesiredCapabilities

4. Here is the crawler code. Please go easy on me: everything is written inside start_requests because that was more convenient. Don't ask me why, I'm lazy, but I have added some comments for you.

5. There is a pitfall here. Note the call browser.switch_to.frame("mainFrame") in the code below: you must switch into this iframe, otherwise the elements that follow cannot be found and an error is raised. The 3-second wait is also necessary; without it the elements are still not found.

class TEMAILSpider(scrapy.Spider):
    name = 'Temail'
    allowed_domains = []

    start_urls = []
    custom_settings = {'CONCURRENT_REQUESTS': '10', }#Number of threads~

    def chuliinfo(self, shuju):
        """Concatenate the pieces of ``shuju`` and drop every newline and space.

        ``shuju`` is an iterable of strings (a plain string also works, since
        joining its characters yields the same string). Returns the cleaned
        text as a single string.
        """
        joined = ''.join(shuju)
        return ''.join(ch for ch in joined if ch not in ('\n', ' '))

    def start_requests(self):
        """Drive Chrome through the Tencent Exmail web UI instead of issuing Scrapy requests.

        This first part starts the browser, logs in and opens the inbox list
        inside the "mainFrame" iframe. The rest of the method pages through
        the inbox and extracts data from the messages whose title matches.
        """
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        # Headless mode is much faster; uncomment the next line to enable it.
        # option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        # Optional speed-up: stop Chrome from loading images (value 2 = block).
        # prefs = {
        #     'profile.default_content_setting_values': {
        #         'images': 2,
        #         # 'javascript': 2
        #     }
        # }
        # option.add_experimental_option('prefs', prefs)

        # Work on a copy: DesiredCapabilities.CHROME is a class-level dict, so
        # editing it in place would change the defaults for every other driver
        # created in this process.
        desired_capabilities = DesiredCapabilities.CHROME.copy()
        # "none" makes browser.get() return immediately; the explicit waits
        # below are what synchronise with the page.
        desired_capabilities["pageLoadStrategy"] = "none"

        browser = webdriver.Chrome(options=option, desired_capabilities=desired_capabilities)
        # Hide navigator.webdriver from the page's scripts on every new document.
        browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
                                {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'})

        browser.get("https://exmail.qq.com/login")  # Tencent Exmail login page
        wait = WebDriverWait(browser, 30)
        # NOTE(review): presumably this link switches the form to
        # account/password login - confirm against the live page.
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="loginForm"]/div[3]/div[3]/a[1]'))).click()
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="inputuin"]'))).send_keys('Enter your account here')  # Account
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="pp"]'))).send_keys('Enter your password here')  # Password
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="btlogin"]'))).click()  # Login button
        # The explicit wait covers the time the mailbox needs to load after login.
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="folder_1"]'))).click()  # Inbox

        # The mail list lives inside this iframe; elements in it cannot be
        # located until the driver has switched into it.
        browser.switch_to.frame("mainFrame")
        # Give the frame content time to render before it is queried.
        time.sleep(3)
     

6. The following code is optional: it lets you jump directly to the page number you want; without it, processing starts from the first page. I recommend trying it out.

  #The next one is for page skipping
        # Open the "jump to page" control of the mail list.
        jump_toggle = (By.XPATH, '//*[@id="maillistjump"]')
        wait.until(EC.presence_of_element_located(jump_toggle)).click()
        # Type the target page number into the jump input.
        page_input = (By.XPATH, '//*[@id="list"]/div[1]//input')
        wait.until(EC.presence_of_element_located(page_input)).send_keys('10')
        # Confirm the jump.
        jump_button = (By.XPATH, '//*[@id="list"]/div[1]//a')
        wait.until(EC.presence_of_element_located(jump_button)).click()
        # Leave the iframe and enter it again, then pause before it is queried.
        browser.switch_to.default_content()
        browser.switch_to.frame("mainFrame")
        time.sleep(3)

7. Next comes the key part: parsing the messages and looping over the pages. The time.sleep calls in it nearly made me cry.

  # print(len(email_list))
        for i in range(1000):
            if i != 0:
                browser.switch_to.frame("mainFrame")
                time.sleep(3)

            email_list = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="frm"]/div[2]/table')))  # All messages on one page
            # email_list1 = WebDriverWait(browser, 5).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@ id="frm"]/div[3]/table ')) (another case)
            email_list1=browser.find_elements_by_xpath('//*[@id="frm"]/div[3]/table')
            # email_list = browser.find_elements_by_xpath('//*[@id="frm"]/div[2]/table')  # //tr/td[3]/table//tr/td[3]/div[1]/u
            # email_title =browser.find_element_by_xpath('//*[@id="frm"]/div[2]/table//tr/td[3]/table//tr/td[3]/div[1]/u').text
            for index1, email in enumerate(email_list):
                email_title = wait.until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).text
                if email_title == 'Message title name':#Here you can use xpath to find out ~ only open the email with your email title
                    print(email_title)
                    # totime=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[1]/input'.format(index1 + 1)))).get_attribute('totime')
                    emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click()
                    time.sleep(1)
                    backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]')))
                    html = browser.page_source
                    response = etree.HTML(html)
                    content = response.xpath('//*[@id="mailContentContainer"]//text()')  # Text content
                    Customer = content[1].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                    blNo = content[2].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                    portdis = content[3].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                    containere = content[4].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                    CustomerNo = Customer[-1]
                    BlNo = blNo[-1]
                    PortDis = portdis[-1]
                    Container_remake = containere[-1]
                    weituoinfo = {
                        "customerNumber": CustomerNo,
                        "shipmentNumber": BlNo,
                        "dischargingPort": PortDis,
                        "boxRemark": Container_remake
                    }
                    print(weituoinfo)
                    hea = {"Content-Type": "application/json"}#Can be removed
                    url = 'The url'#Can be removed
                    response1 = requests.post(url, data=json.dumps(weituoinfo), headers=hea)#Can be removed
                    backemail.click()  # Click back
                    time.sleep(1)
                elif email_title == 'Message title name':#Here you can use xpath to find out ~ only open the email with your email title
                    print(email_title)
                    emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click()
                    time.sleep(1)
                    backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]')))
                    html = browser.page_source
                    response = etree.HTML(html)
                    content = response.xpath('//*[@id="mailContentContainer"]//text()')  # Text content
                    Customer = content[1].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                    booking = content[2].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                    blNo = content[3].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                    portdis = content[4].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                    containere = content[5].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(": ")
                    CustomerNo = Customer[-1]
                    Booking = booking[-1]
                    BlNo = blNo[-1]
                    PortDis = portdis[-1]
                    Container_remake = containere[-1]
                    shipbookinginfo = {
                        "customerNumber": CustomerNo,
                        "shipmentNumber":Booking,
                        "blno": BlNo,
                        "dischargingPort":PortDis,
                        "predistributionBox": Container_remake
                    }
                    print(shipbookinginfo)
                    backemail.click()  # Click back
                    time.sleep(1)
            if email_list1!=[]:
                for index1, email in enumerate(email_list):
                    email_title = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).text
                    if email_title ==  'Message header name':#Here you can use xpath to find out ~ only open the email with your email title
                        print(email_title)
                        # totime=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[1]/input'.format(index1 + 1)))).get_attribute('totime')
                        emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click()
                        time.sleep(1)
                        backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]')))
                        html = browser.page_source
                        response = etree.HTML(html)
                        content = response.xpath('//*[@id="mailContentContainer"]//text()')  # Text content
                        Customer = content[1].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                        blNo = content[2].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                        portdis = content[3].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                        containere = content[4].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ')
                        CustomerNo = Customer[-1]
                        BlNo = blNo[-1]
                        PortDis = portdis[-1]
                        Container_remake = containere[-1]
                        weituoinfo = {
                            "customerNumber": CustomerNo,
                            "shipmentNumber": BlNo,
                            "dischargingPort": PortDis,
                            "boxRemark": Container_remake
                        }
                        print(weituoinfo)
                        backemail.click()  # Click back
                        time.sleep(1)
                    elif email_title ==  'Message title name':#Here you can use xpath to find out ~ only open the email with your email title
                        print(email_title)
                        emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click()
                        time.sleep(1)
                        backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]')))
                        html = browser.page_source
                        response = etree.HTML(html)
                        content = response.xpath('//*[@id="mailContentContainer"]//text()')  # Text content
                        Customer = content[1].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                        booking = content[2].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                        blNo = content[3].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                        portdis = content[4].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ')
                        containere = content[5].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(": ")
                        CustomerNo = Customer[-1]
                        Booking = booking[-1]
                        BlNo = blNo[-1]
                        PortDis = portdis[-1]
                        Container_remake = containere[-1]
                        shipbookinginfo = {
                            "customerNumber": CustomerNo,
                            "shipmentNumber":Booking,
                            "blno": BlNo,
                            "dischargingPort":PortDis,
                            "predistributionBox": Container_remake
                        }
                        print(shipbookinginfo)
                        backemail.click()  # Click back
                        time.sleep(1)

            next_pages = wait.until(EC.presence_of_element_located((By.ID, 'nextpage')))  # Turn page prevpage#nextpage
            next_pages.click()
            browser.switch_to.default_content()
            # browser.switch_to.frame("mainFrame")
            if next_pages == []:
                break

8. If anything is still unclear, feel free to send me a message, hahaha. Maybe I should start taking orders online? Ha ha~

# Run this file directly to start the crawl. The guard keeps the crawl from
# being launched again when Scrapy merely imports the module.
if __name__ == "__main__":
    from scrapy import cmdline  # not among the imports at the top of the file

    cmdline.execute("scrapy crawl Temail".split())

Posted by wee_eric on Sun, 21 Jun 2020 22:42:48 -0700