Using Selenium to crawl dynamically rendered pages

Keywords: Big Data Selenium Python JavaScript Google

Explanation

Some websites do not deliver their content as static HTML; the pages are rendered by JavaScript instead. For such pages, parsing with regular expressions and XPath alone is not feasible. There are two approaches. First, we can analyze the Ajax requests, work out the pattern behind their parameters, and then simulate those requests ourselves (an earlier post by this blogger covered how to crawl data through Ajax parameters). Second, if no pattern can be found in the Ajax parameters, we can use Selenium together with ChromeDriver to simulate a browser, which lets us drive user interaction on the page with code.
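Before the full crawler below, here is a minimal sketch of the idea, assuming a placeholder URL and a hypothetical CSS selector '#content': start Chrome through ChromeDriver, load the page, wait for a JavaScript-rendered element to appear, and then read the rendered HTML from page_source.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()            # start Chrome through ChromeDriver
wait = WebDriverWait(browser, 10)       # wait at most 10 seconds for each condition
try:
    browser.get('https://example.com')  # placeholder URL
    # block until the element rendered by JavaScript is present in the DOM
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content')))
    html = browser.page_source          # HTML after JavaScript has executed
    print(len(html))
finally:
    browser.quit()                      # always close the browser

The full crawler that follows uses exactly this pattern, plus explicit waits for page navigation and PyQuery for parsing.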

Analysis

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import re
from urllib.parse import quote
import csv
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser=webdriver.Chrome()  #Use Chrome via ChromeDriver
wait=WebDriverWait(browser,3)  #Explicit waits time out after at most 3 seconds
KEYWORD='python'   #Search keyword to crawl
#Crawl the search results for the given page number
def get_one_page(page):
    print('Crawling page '+str(page))
    #quote() URL-encodes the keyword so Chinese characters are not garbled
    url='https://s.taobao.com/search?q='+quote(KEYWORD)
    try:
        browser.get(url)   #Request the page with GET
        if page>1:         #If the page number is greater than 1, jump to that page first
            input=wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.form .input.J_Input'))) #Get the page-number input box
            submit=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.form .btn.J_Submit')))#Get the confirm button
            input.clear()  #Clear the page-number input box
            input.send_keys(page) #Type the target page number
            submit.click()  #Click the confirm button
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'ul.items>li.item.active>span.num'),str(page))) #Confirm that the target page is now active
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item')))   #Wait for the list of items to load
        #Parse the page and extract the list of items
        html=browser.page_source  #Get the rendered source of this page
        doc=PyQuery(html)         #Build a PyQuery object
        items=doc('.m-itemlist .items .item').items()   #Generator over the item nodes
        for item in items:
            price=item.find('.price').text()   #Get the price of each item
            price=re.sub('¥\n','',price)       #Strip the currency symbol from the price
            product={
                'store':item.find('.shop').text(),
                'image':item.find('.pic .img').attr('data-src'),
                'price':price
            }                                   #Store the shop name, image URL and price in a dictionary
            print(product)
            yield product                      #Yield each item's dictionary
    except TimeoutException:
        yield from get_one_page(page)     #Request timed out: retry the page and forward its results

if __name__=='__main__':
    with open('result.csv','a',newline='') as csvfile:   #Open the local CSV file in append mode; newline='' avoids blank lines
        fieldnames=['store','image','price']              #CSV header fields
        csvwriter=csv.DictWriter(csvfile,fieldnames)
        csvwriter.writeheader()                            #Write the header row
        for page in range(1,101):                          #Crawl pages 1 to 100
            products=get_one_page(page)                    #Generator over all items on this page
            for product in products:                      #Iterate over the items on the page
                csvwriter.writerow(product)                #Write each item to the CSV file
    print('Crawl finished')

To keep this post short, the detailed explanation is given directly in the code comments, so I will not repeat it here.
