Explanation
Some websites do not deliver their content as plain HTML; the pages are rendered with JavaScript instead. For such pages, parsing the raw response with regular expressions and XPath alone is not feasible. There are two options. First, we can analyze the Ajax requests the page makes, work out the rules behind their parameters, and then send those Ajax requests ourselves (a previous post covered how to crawl data by working with Ajax parameters). Second, if we cannot work out the rules from the Ajax parameters, we can use Selenium together with ChromeDriver to drive a real browser, i.e. use code to simulate a user interacting with the page.
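As a quick illustration of the first approach, here is a minimal sketch that sends an Ajax-style request directly with the requests library. The endpoint, parameters and headers below are purely hypothetical placeholders; the real ones have to be read out of the browser's developer tools (Network panel, XHR requests) for the specific site you are crawling.

import requests

# Hypothetical Ajax endpoint and parameters -- replace them with the ones
# you find in the browser's developer tools for the target site.
api_url = 'https://example.com/api/search'
params = {'q': 'python', 'page': 1}
headers = {
    'X-Requested-With': 'XMLHttpRequest',   # mark the request as Ajax, some sites check this
    'User-Agent': 'Mozilla/5.0'             # a browser-like User-Agent
}

resp = requests.get(api_url, params=params, headers=headers, timeout=10)
data = resp.json()   # Ajax endpoints usually return JSON, which can be parsed directly
print(data)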
Analysis
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import csv
from urllib.parse import quote

from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()        # The browser used is Google Chrome
wait = WebDriverWait(browser, 3)    # Maximum time to wait for elements: 3 seconds
KEYWORD = 'python'                  # Keyword to search for

def get_one_page(page):
    """Crawl the products on the given page number of the search results."""
    print('Crawling page ' + str(page))
    # quote() encodes the keyword in the URL to prevent garbled characters (e.g. Chinese)
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    try:
        browser.get(url)            # Request the page
        if page > 1:                # If the page number is greater than 1, jump to that page first
            input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.form .input.J_Input')))       # Page-number input box
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.form .btn.J_Submit')))        # Confirmation button
            input.clear()           # Clear the page-number input box
            input.send_keys(page)   # Type the target page number
            submit.click()          # Click the confirmation button
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, 'ul.items > li.item.active > span.num'),
                str(page)))         # Confirm that we have jumped to the target page
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))       # Wait for the product list to load
        # Parse the product list
        html = browser.page_source                                # Source code of the current page
        doc = PyQuery(html)                                       # Build a PyQuery object
        items = doc('.m-itemlist .items .item').items()           # Generator over the products
        for item in items:
            price = item.find('.price').text()                    # Price of each product
            price = re.sub('¥\n', '', price)                      # Clean up the price text
            product = {
                'store': item.find('.shop').text(),
                'image': item.find('.pic .img').attr('data-src'),
                'price': price
            }                       # Store the shop name, image URL and price in a dictionary
            print(product)
            yield product           # Yield the dictionary for each product
    except TimeoutException:
        yield from get_one_page(page)   # On timeout, retry this page (yield from so the retry is actually consumed)

if __name__ == '__main__':
    # Open the local csv file in append mode; newline='' avoids blank lines between rows
    with open('result.csv', 'a', newline='') as csvfile:
        filenames = ['store', 'image', 'price']                   # Header
        csvwriter = csv.DictWriter(csvfile, filenames)
        csvwriter.writeheader()                                   # Write the header row
        for page in range(1, 101):                                # Loop over the 100 pages
            products = get_one_page(page)                         # Get all the products on each page
            for product in products:                              # Traverse the products on every page
                csvwriter.writerow(product)                       # Write each product's information to the csv
    print('Crawl finish')
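One optional tweak, not part of the code above: if you do not need to watch the pages load, Chrome can be started in headless mode, which is usually a little faster and lets the script run on a server without a display. A minimal sketch, assuming a reasonably recent Chrome and ChromeDriver:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')            # run Chrome without opening a window
browser = webdriver.Chrome(options=options)   # everything else in the script stays the same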
Because I am a bit lazy, the detailed explanation is given directly in the code comments, so I will not repeat it here!