1. Recently my manager got annoyed by his email, perhaps because there was simply too much of it. He wanted the messages read in batches, the required data extracted, and the results sent to his interface. Tencent's JavaScript, however, is too extensive and too difficult to read and understand (doge), so I decided to use Selenium to simulate a browser instead.
2. Because the crawl has to run on a schedule every day, a framework is used here. Its main purpose is simply to make scheduling convenient: you just publish the spider directly. We already had a scheduling framework in place, so we went with Scrapy. If you do not need scheduled runs, use whatever approach you like.
3. Import the libraries first. Some of them appear to be unused; if your editor shows an import greyed out, just remove it or comment it out.
import json
import re
import time

import requests
import scrapy
from django.http import request, HttpResponse
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ChromeOptions, DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
4. Here is the crawler code. Please don't flame me: everything is written inside start_requests because that was more convenient. Don't ask me why; I'm lazy. I have added some comments for you, though.
5. There is a pitfall here. Note the browser.switch_to.frame("mainFrame") call: you must switch into this iframe, otherwise the following elements cannot be found and an error is raised. The 3-second wait is also necessary; without it the elements are not found either.
class TEMAILSpider(scrapy.Spider):
    """Scrapy spider that drives Chrome (via Selenium) through Tencent Exmail.

    Scrapy is used only as a launcher: all of the work happens inside
    ``start_requests``, which logs in to the web mailbox, opens the inbox and
    then walks the mail list.
    """

    name = 'Temail'
    allowed_domains = []
    start_urls = []
    custom_settings = {'CONCURRENT_REQUESTS': '10', }  # Scrapy's concurrent-request limit

    # Login settings. Replace the two placeholders with real credentials
    # (or override them in a subclass) before running the spider.
    login_url = "https://exmail.qq.com/login"
    login_account = 'Enter your account here'
    login_password = 'Enter your password here'

    def chuliinfo(self, shuju):
        """Join the text fragments in *shuju* and drop newlines and spaces.

        Args:
            shuju: An iterable of strings (or a single string).

        Returns:
            The concatenated text without ``'\\n'`` and ``' '`` characters.
        """
        return ''.join(shuju).replace('\n', '').replace(' ', '')

    def start_requests(self):
        """Log in to Exmail with a Selenium-driven Chrome and open the inbox.

        The first part of the method (below) builds the browser, signs in and
        switches into the ``mainFrame`` iframe that hosts the mail list.
        """
        option = ChromeOptions()
        # Drop the switches Chrome normally sets when it is automated.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        # Tip: option.add_argument('--headless') makes the run much faster.
        option.add_argument('--disable-gpu')
        # Tip: image loading can be disabled to speed things up with
        #   option.add_experimental_option(
        #       'prefs', {'profile.default_content_setting_values': {'images': 2}})

        # Work on a copy: DesiredCapabilities.CHROME is a shared class-level
        # dict, and mutating it would leak the setting into every other driver.
        desired_capabilities = DesiredCapabilities.CHROME.copy()
        # "none": browser.get() returns immediately; the explicit waits below
        # do the synchronisation instead.
        desired_capabilities["pageLoadStrategy"] = "none"
        browser = webdriver.Chrome(options=option, desired_capabilities=desired_capabilities)
        # Make navigator.webdriver read as undefined on every new document.
        browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'})
        browser.get(self.login_url)

        # Every lookup below waits up to 30 seconds for its element to appear.
        wait = WebDriverWait(browser, 30)
        # NOTE(review): presumably switches the form to account/password
        # login - confirm against the live page.
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="loginForm"]/div[3]/div[3]/a[1]'))).click()
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="inputuin"]'))).send_keys(
            self.login_account)  # account input
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="pp"]'))).send_keys(
            self.login_password)  # password input
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="btlogin"]'))).click()  # login button

        # Open the inbox once the folder link has appeared.
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="folder_1"]'))).click()
        # The mail list lives inside the "mainFrame" iframe; without this
        # switch (and the short pause) the list elements are not found.
        browser.switch_to.frame("mainFrame")
        time.sleep(3)
6. The following code is optional: it lets you jump straight to the page number you want. Without it, processing starts from the first page. I recommend trying it out.
#The next one is for page skipping maillistjump = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="maillistjump"]'))).click() # Jump jumppage = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="list"]/div[1]//input'))).send_keys( '10') # Enter account number # Jump pages mljump = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="list"]/div[1]//a'))).click() # Jump button browser.switch_to.default_content() browser.switch_to.frame("mainFrame") time.sleep(3)
7. Next comes the key part: parsing the mails and looping over the pages. The time.sleep calls in here are what made me cry.
# print(len(email_list)) for i in range(1000): if i != 0: browser.switch_to.frame("mainFrame") time.sleep(3) email_list = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="frm"]/div[2]/table'))) # All messages on one page # email_list1 = WebDriverWait(browser, 5).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@ id="frm"]/div[3]/table ')) (another case) email_list1=browser.find_elements_by_xpath('//*[@id="frm"]/div[3]/table') # email_list = browser.find_elements_by_xpath('//*[@id="frm"]/div[2]/table') # //tr/td[3]/table//tr/td[3]/div[1]/u # email_title =browser.find_element_by_xpath('//*[@id="frm"]/div[2]/table//tr/td[3]/table//tr/td[3]/div[1]/u').text for index1, email in enumerate(email_list): email_title = wait.until(EC.presence_of_element_located( (By.XPATH, '//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).text if email_title == 'Message title name':#Here you can use xpath to find out ~ only open the email with your email title print(email_title) # totime=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[1]/input'.format(index1 + 1)))).get_attribute('totime') emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click() time.sleep(1) backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]'))) html = browser.page_source response = etree.HTML(html) content = response.xpath('//*[@id="mailContentContainer"]//text()') # Text content Customer = content[1].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') blNo = content[2].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') portdis = content[3].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') containere = content[4].replace(' ', '').replace('\n', 
'').replace('\xa0', '').replace("'", "").split(': ') CustomerNo = Customer[-1] BlNo = blNo[-1] PortDis = portdis[-1] Container_remake = containere[-1] weituoinfo = { "customerNumber": CustomerNo, "shipmentNumber": BlNo, "dischargingPort": PortDis, "boxRemark": Container_remake } print(weituoinfo) hea = {"Content-Type": "application/json"}#Can be removed url = 'The url'#Can be removed response1 = requests.post(url, data=json.dumps(weituoinfo), headers=hea)#Can be removed backemail.click() # Click back time.sleep(1) elif email_title == 'Message title name':#Here you can use xpath to find out ~ only open the email with your email title print(email_title) emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click() time.sleep(1) backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]'))) html = browser.page_source response = etree.HTML(html) content = response.xpath('//*[@id="mailContentContainer"]//text()') # Text content Customer = content[1].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') booking = content[2].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') blNo = content[3].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') portdis = content[4].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') containere = content[5].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(": ") CustomerNo = Customer[-1] Booking = booking[-1] BlNo = blNo[-1] PortDis = portdis[-1] Container_remake = containere[-1] shipbookinginfo = { "customerNumber": CustomerNo, "shipmentNumber":Booking, "blno": BlNo, "dischargingPort":PortDis, "predistributionBox": Container_remake } print(shipbookinginfo) backemail.click() # Click back time.sleep(1) if email_list1!=[]: for index1, email in 
enumerate(email_list): email_title = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).text if email_title == 'Message header name':#Here you can use xpath to find out ~ only open the email with your email title print(email_title) # totime=wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[2]/table[{}]//tr/td[1]/input'.format(index1 + 1)))).get_attribute('totime') emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click() time.sleep(1) backemail = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="mainmail"]/div[1]/div[2]/a[1]'))) html = browser.page_source response = etree.HTML(html) content = response.xpath('//*[@id="mailContentContainer"]//text()') # Text content Customer = content[1].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') blNo = content[2].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') portdis = content[3].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') containere = content[4].replace(' ', '').replace('\n', '').replace('\xa0', '').replace("'", "").split(': ') CustomerNo = Customer[-1] BlNo = blNo[-1] PortDis = portdis[-1] Container_remake = containere[-1] weituoinfo = { "customerNumber": CustomerNo, "shipmentNumber": BlNo, "dischargingPort": PortDis, "boxRemark": Container_remake } print(weituoinfo) backemail.click() # Click back time.sleep(1) elif email_title == 'Message title name':#Here you can use xpath to find out ~ only open the email with your email title print(email_title) emailinfo = wait.until(EC.presence_of_element_located((By.XPATH,'//*[@id="frm"]/div[3]/table[{}]//tr/td[3]/table//tr/td[3]/div[1]/u'.format(index1 + 1)))).click() time.sleep(1) backemail = wait.until(EC.presence_of_element_located((By.XPATH, 
'//*[@id="mainmail"]/div[1]/div[2]/a[1]'))) html = browser.page_source response = etree.HTML(html) content = response.xpath('//*[@id="mailContentContainer"]//text()') # Text content Customer = content[1].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') booking = content[2].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') blNo = content[3].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') portdis = content[4].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(': ') containere = content[5].replace(' ', '').replace('\n', '').replace("'", "").replace('\xa0', '').split(": ") CustomerNo = Customer[-1] Booking = booking[-1] BlNo = blNo[-1] PortDis = portdis[-1] Container_remake = containere[-1] shipbookinginfo = { "customerNumber": CustomerNo, "shipmentNumber":Booking, "blno": BlNo, "dischargingPort":PortDis, "predistributionBox": Container_remake } print(shipbookinginfo) backemail.click() # Click back time.sleep(1) next_pages = wait.until(EC.presence_of_element_located((By.ID, 'nextpage'))) # Turn page prevpage#nextpage next_pages.click() browser.switch_to.default_content() # browser.switch_to.frame("mainFrame") if next_pages == []: break
8. If you still don't understand, you can send me a message, hahaha. Taking orders online?? Ha ha ha~
# Run this file to start the crawl.
if __name__ == "__main__":
    # Imported here so the launcher is self-contained; the guard keeps the
    # crawl from being started again when Scrapy imports this module.
    from scrapy import cmdline

    cmdline.execute("scrapy crawl Temail".split())