Using python+selenium to crawl patents on derwent database
Demand:
Log in to Web of Science, enter the Derwent database, search for each company in an Excel list, and download all of the search result records to a designated local path.
Approach: The steps are similar to those used for crawling Cortellis, and selenium is again used to simulate a browser. The main elements that have to be simulated are drop-down menus, buttons, and input boxes. Generally speaking, it is not complicated.
However, note that if a Derwent search returns 0 results, the site returns an error page. When this happens, the resulting exception can be caught with try-except and the page refreshed once with selenium.
The disadvantage is that try-except is used heavily, and I have not found a better solution: the behavior of this database is too unpredictable.
Here is the code:
"""Crawl Derwent Innovations Index patent records with Selenium.

For every company name in the first column of an Excel workbook, open the
Web of Science search form, search Derwent by assignee, and export every
result page (50 records per page, full record, tab-delimited) into a
per-company download directory.
"""
import io
import json
import os
import pickle
import re
import sys
import time

import requests
import xlrd
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Workbook whose first column ("Sheet1", header in row 1) lists the companies.
COMPANY_WORKBOOK = './20190710_Zhangbo_Firm list.xlsx'
# Root directory; each company gets its own sub-directory below it.
DOWNLOAD_ROOT = "G:\\spider_study\\selenium\\derwent_result\\"
# NOTE(review): the SID query parameter looks like a session id and will
# presumably expire -- confirm and refresh it before running.
SEARCH_URL = "http://apps.webofknowledge.com/DIIDW_GeneralSearch_input.do?product=DIIDW&search_mode=GeneralSearch&SID=7FGhAhNWvHXPFIknaix&preferencesSaved="

# Seconds to wait for a result page to (re)load.
RESULT_LOAD_WAIT = 10
# Seconds to wait after moving on from a page (also lets the download finish).
NEXT_PAGE_WAIT = 8

# Errors raised when an expected page element is missing or not clickable.
# IndexError covers indexing into an empty find_elements() result.
_PAGE_ERRORS = (WebDriverException, IndexError)

# The driver of the company currently being crawled. Kept at module level
# because get_current_cookie() falls back to it when called without arguments.
browser = None


def read_company(fileName):
    """Return the company names from column 0 of "Sheet1", skipping the header.

    Raises:
        xlrd.XLRDError: if the workbook has no sheet named "Sheet1".
    """
    # NOTE(review): xlrd 2.x can no longer open .xlsx files -- confirm the
    # installed xlrd version supports this workbook.
    bk = xlrd.open_workbook(fileName)
    try:
        sh = bk.sheet_by_name("Sheet1")
    except xlrd.XLRDError:
        # Previously execution continued and failed on the unbound sheet
        # variable; re-raise the real cause instead.
        print("Code error")
        raise
    return sh.col_values(0)[1:]


def get_current_cookie(driver=None):
    """Return the driver's cookies as a "name=value; name=value" header string.

    Args:
        driver: WebDriver to read cookies from; defaults to the module-level
            ``browser`` of the company currently being crawled.
    """
    if driver is None:
        driver = browser
    # Fields must be separated by ";", otherwise the header is invalid.
    return '; '.join(
        item['name'] + '=' + item['value'] for item in driver.get_cookies()
    )


def set_chrome_pref(path):
    """Return ChromeOptions whose default download directory is *path*."""
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory": path}
    chromeOptions.add_experimental_option("prefs", prefs)
    time.sleep(1)
    return chromeOptions


def _pick_option(driver, index):
    """Click entry *index* of the select2 dropdown that is currently open."""
    driver.find_elements(By.CLASS_NAME, 'select2-results__option')[index].click()


def _submit_search(driver, company):
    """Open the search form and run an assignee search for *company*."""
    driver.get(SEARCH_URL)

    # Database dropdown: entry 4 is the Derwent database.
    database_box = driver.find_element(By.CLASS_NAME, 'select2')
    database_box.find_element(By.CLASS_NAME, 'select2-selection').click()
    _pick_option(driver, 4)

    # Search-field dropdown: entry 3 is the assignee field.
    driver.find_element(By.ID, 'select2-select1-container').click()
    _pick_option(driver, 3)

    search_area = driver.find_element(By.CLASS_NAME, 'block-search-content')
    query_input = search_area.find_element(By.CLASS_NAME, 'focusinput')
    time.sleep(1)
    query_input.send_keys(company)

    driver.find_elements(By.CLASS_NAME, 'large-button')[0].click()
    # switch_to is required after the click so that later lookups are
    # resolved against the result window.
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(RESULT_LOAD_WAIT)


def _read_page_count(driver):
    """Return the number of result pages, or None if it cannot be read.

    The site sometimes serves an error page, so a failed read is retried
    once after refreshing.
    """
    for attempt in range(2):
        try:
            text = driver.find_element(By.ID, 'pageCount.top').text
            # Tolerate thousands separators such as "1,234".
            return int(text.replace(',', '').strip())
        except (WebDriverException, ValueError):
            if attempt == 0:
                driver.refresh()
                time.sleep(RESULT_LOAD_WAIT)
    return None


def _select_all_on_page(driver):
    """Tick the "select page" checkbox; refresh and retry once on failure.

    Returns True when the checkbox was clicked.
    """
    for attempt in range(2):
        try:
            driver.find_element(By.ID, 'SelectPageChkId').click()
            return True
        except WebDriverException:
            if attempt == 0:
                # Python's WebDriver exposes refresh() directly; there is no
                # navigate() as in the Java bindings.
                driver.refresh()
                time.sleep(RESULT_LOAD_WAIT)
    return False


def _export_current_page(driver, first_export):
    """Export the current result page to the download directory.

    Args:
        first_export: True for the first export of a browser session, when
            the output format still has to be chosen; afterwards the site
            remembers the format and only the export button is needed.
    """
    if first_export:
        driver.find_elements(By.ID, 'exportTypeName')[0].click()
        driver.find_elements(By.CLASS_NAME, 'quickOutputOther')[0].click()
        # All records on the page, full record content, tab-delimited file.
        driver.find_element(By.ID, 'numberOfRecordsAllOnPage').click()
        driver.find_element(By.ID, 'select2-bib_fields-container').click()
        _pick_option(driver, -1)
        driver.find_element(By.ID, 'select2-saveOptions-container').click()
        _pick_option(driver, 3)
    else:
        driver.find_elements(By.CLASS_NAME, 'onload-secondary-button')[0].click()
    time.sleep(1)

    # Start the download, then close the export dialog.
    driver.find_elements(By.CLASS_NAME, 'onload-primary-button')[-1].click()
    time.sleep(3)
    driver.find_element(By.CLASS_NAME, 'flat-button').click()
    time.sleep(1)


def crawl_company(company):
    """Search Derwent for *company* and export every result page.

    Returns True if the result pages were processed, False if the search
    produced no usable result page. The browser is always closed, even
    when an unexpected error propagates.
    """
    global browser

    download_path = DOWNLOAD_ROOT + company + '\\'
    os.makedirs(download_path, exist_ok=True)

    # `options=` replaces the deprecated `chrome_options=` keyword.
    browser = webdriver.Chrome(options=set_chrome_pref(download_path))
    try:
        _submit_search(browser, company)

        # Show the largest page size (last dropdown entry, 50 per page).
        # Failure here means there is no result list, so skip the company.
        try:
            browser.find_element(
                By.ID, 'select2-selectPageSize_bottom-container'
            ).click()
            _pick_option(browser, -1)
            time.sleep(RESULT_LOAD_WAIT)
        except _PAGE_ERRORS:
            return False

        total_pages = _read_page_count(browser)
        if total_pages is None:
            print('Could not read the page count for: ' + company)
            return False
        print('There are ' + str(total_pages) + ' pages for this company!')

        for page_index in range(total_pages):
            if not _select_all_on_page(browser):
                # Best effort: the export is configured for all records on
                # the page, so continue as the original script did.
                print('Could not select all records on page '
                      + str(page_index + 1))
            _export_current_page(browser, first_export=(page_index == 0))

            # There is nothing to navigate to after the last page.
            if page_index < total_pages - 1:
                browser.find_elements(By.CLASS_NAME, 'paginationNext')[0].click()
                browser.switch_to.window(browser.window_handles[0])
            time.sleep(NEXT_PAGE_WAIT)
        return True
    finally:
        browser.quit()


def main():
    """Crawl every company listed in COMPANY_WORKBOOK."""
    # Re-wrap stdout as gb18030 so that company names can be printed on the
    # console without encoding errors.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

    print('load company name...')
    for company in read_company(COMPANY_WORKBOOK):
        # Cells may be empty or numeric; normalise to text and skip blanks.
        company = str(company).strip()
        if not company:
            continue
        print('search company patent for: ' + company)
        crawl_company(company)


if __name__ == "__main__":
    main()