Using python+selenium to crawl patents on derwent database

Keywords: Selenium Database JSON encoding

Using python+selenium to crawl patents on derwent database

Requirement:
Log in to Web of Science, enter the Derwent database, search for each company in an Excel list, and download all search result records to a designated local path.

Approach: This is similar to the steps used for crawling Cortellis, and it also uses Selenium to drive a browser. The main interactions to simulate are drop-down menus, buttons, and input boxes. Overall, it is not complicated.
However, note that when a Derwent search returns 0 results, the site shows an error page. When this happens, the resulting exception can be caught with try-except and the page refreshed once with Selenium.

The drawback is that try-except is used extensively, and I have not found a better solution: this database behaves too unpredictably.

Here is the code:

import io
import json
import os
import pickle
import re
import sys
import time

import requests
import xlrd
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select


def read_company(fileName):
    """Return the company names stored in the first column of "Sheet1".

    Args:
        fileName: Path of the Excel workbook, opened with ``xlrd``.

    Returns:
        list: The values of column 0 of the sheet named "Sheet1", with the
        first (header) row dropped.

    Raises:
        ValueError: If the workbook contains no sheet named "Sheet1".
    """
    # NOTE: xlrd 2.0+ only reads legacy .xls files; opening an .xlsx
    # workbook requires xlrd < 2.0.
    bk = xlrd.open_workbook(fileName)
    try:
        sh = bk.sheet_by_name("Sheet1")
    except xlrd.XLRDError as err:
        # Fail here with a clear message instead of printing and then hitting
        # a NameError on the unbound sheet variable a few lines later.
        raise ValueError(
            'Workbook %r has no sheet named "Sheet1"' % (fileName,)
        ) from err
    # Row 0 is skipped as the header row; everything below it is a company.
    return sh.col_values(0)[1:]
#Get the cookie for the current page
def get_current_cookie():
    """Return the current browser cookies as one ``name=value; ...`` string.

    Reads the cookies from the module-level ``browser`` WebDriver instance.

    Returns:
        str: Every cookie rendered as ``name=value``, joined with ``"; "``.
        The separator matters: the original author notes that the string is
        invalid unless fields are separated by ";".
    """
    return '; '.join(
        item['name'] + '=' + item['value'] for item in browser.get_cookies()
    )

#Setting Download Path
def set_chrome_pref(path):
    """Build Chrome options whose default download directory is *path*.

    Args:
        path: Value stored under the "download.default_directory" preference.

    Returns:
        The configured ``webdriver.ChromeOptions`` instance.
    """
    download_prefs = {"download.default_directory": path}
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", download_prefs)
    # Short pause kept from the original implementation before handing the
    # options back to the caller.
    time.sleep(1)
    return options

# from openpyxl import Workbook as wb

# Re-wrap stdout with the gb18030 encoding so the console can print the
# company names (presumably a Chinese-locale Windows console -- confirm).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
fileName0 = './20190710_Zhangbo_Firm list.xlsx'

# Entry page of the Derwent general search.
# NOTE(review): the SID query parameter looks like a session id and
# presumably has to be replaced with one from a live session -- confirm.
_SEARCH_URL = "http://apps.webofknowledge.com/DIIDW_GeneralSearch_input.do?product=DIIDW&search_mode=GeneralSearch&SID=7FGhAhNWvHXPFIknaix&preferencesSaved="
# Each company gets its own sub-directory below this folder.
_DOWNLOAD_ROOT = "G:\\spider_study\\selenium\\derwent_result\\"


def _pick_dropdown_option(browser, index):
    """Click the entry at position *index* of the currently open select2 list."""
    browser.find_elements(By.CLASS_NAME, 'select2-results__option')[index].click()


def _submit_search(browser, company):
    """Fill in the search form for *company* and submit it."""
    # Drop-down menu to select the Derwent database.
    database_menu = browser.find_element(By.CLASS_NAME, 'select2')
    database_menu.find_element(By.CLASS_NAME, 'select2-selection').click()
    _pick_dropdown_option(browser, 4)

    # Choose the search field (the original author picks the assignee entry).
    browser.find_element(By.ID, 'select2-select1-container').click()
    _pick_dropdown_option(browser, 3)

    # Locate the search input and type the company name.
    search_input = browser.find_element(By.CLASS_NAME, 'block-search-content')
    search_input = search_input.find_element(By.CLASS_NAME, 'focusinput')
    time.sleep(1)
    search_input.send_keys(company)

    browser.find_elements(By.CLASS_NAME, 'large-button')[0].click()

    # Switch to the window after clicking so later lookups hit the result page.
    browser.switch_to.window(browser.window_handles[0])
    time.sleep(10)


def _read_page_count(browser):
    """Return the number of result pages, or 0 if it cannot be determined.

    The page is refreshed once and the lookup retried before giving up, so
    the caller never iterates over an undefined or stale page count.
    """
    for attempt in range(2):
        try:
            page_text = browser.find_element(By.ID, 'pageCount.top').text
            # Tolerate a thousands separator such as "1,234".
            return int(page_text.replace(',', ''))
        except (WebDriverException, ValueError):
            if attempt == 0:
                browser.refresh()
                time.sleep(10)
    return 0


def _select_all_on_page(browser):
    """Tick the select-all checkbox, refreshing the page once if it is missing.

    A second failure propagates to the caller, which abandons the company.
    """
    try:
        browser.find_element(By.ID, 'SelectPageChkId').click()
    except WebDriverException:
        # The Python binding exposes refresh() directly on the driver;
        # navigate().refresh() only exists in the Java binding.
        browser.refresh()
        time.sleep(10)
        browser.find_element(By.ID, 'SelectPageChkId').click()


def _export_current_page(browser, first_export):
    """Export the records of the current page and close the download dialog.

    When *first_export* is true the export format is configured first;
    otherwise the export button is clicked and the previously saved format
    is reused.
    """
    if first_export:
        # Select export format.
        browser.find_elements(By.ID, 'exportTypeName')[0].click()
        browser.find_elements(By.CLASS_NAME, 'quickOutputOther')[0].click()

        # Export all records on the page.
        browser.find_element(By.ID, 'numberOfRecordsAllOnPage').click()

        # Record content: last entry of the drop-down.
        browser.find_element(By.ID, 'select2-bib_fields-container').click()
        _pick_dropdown_option(browser, -1)

        # File format: entry 3 of the drop-down.
        browser.find_element(By.ID, 'select2-saveOptions-container').click()
        _pick_dropdown_option(browser, 3)
    else:
        # The output format has already been saved; just open the export dialog.
        browser.find_elements(By.CLASS_NAME, 'onload-secondary-button')[0].click()
        time.sleep(1)

    # Export download.
    browser.find_elements(By.CLASS_NAME, 'onload-primary-button')[-1].click()
    time.sleep(3)

    # Close the download dialog.
    browser.find_element(By.CLASS_NAME, 'flat-button').click()
    time.sleep(1)


print('load company name...')
company_list = read_company(fileName0)
flag = 0 #First run
for company in company_list:
    if not company:
        # Blank spreadsheet cell: nothing to search for.
        continue
    print('search company patent for: ' + company)
    download_path = _DOWNLOAD_ROOT + company + '\\'
    os.makedirs(download_path, exist_ok=True)

    # One browser per company so downloads land in that company's folder.
    browser = webdriver.Chrome(options=set_chrome_pref(download_path))
    try:
        browser.get(_SEARCH_URL)
        _submit_search(browser, company)

        # Switch to the largest page size (last entry of the drop-down). A
        # search with 0 results shows an error page without this control, in
        # which case the company is skipped.
        try:
            browser.find_element(By.ID, 'select2-selectPageSize_bottom-container').click()
            _pick_dropdown_option(browser, -1)
            time.sleep(10)
        except (WebDriverException, IndexError):
            continue

        # Get the total number of pages.
        page_count = _read_page_count(browser)
        if page_count == 0:
            print('Could not read the page count for: ' + company)
            continue
        print('There are ' + str(page_count) + ' pages for this company!')

        for page_index in range(page_count):
            _select_all_on_page(browser)
            _export_current_page(browser, first_export=(flag == 0))
            flag = 1

            # Go to the next page, except after the last one where there is
            # nothing left to navigate to.
            if page_index < page_count - 1:
                browser.find_elements(By.CLASS_NAME, 'paginationNext')[0].click()
                browser.switch_to.window(browser.window_handles[0])
            # Wait for the next page to load; on the last page this also
            # leaves time before the browser is closed.
            time.sleep(8)
    except (WebDriverException, IndexError) as exc:
        # An unexpected page state aborts this company only, not the whole run.
        print('Failed to export patents for: ' + company + ' (' + str(exc) + ')')
    finally:
        # Always release the browser, including on the early "continue" paths.
        browser.quit()
        flag = 0 #Initialization


Posted by SJones on Thu, 03 Oct 2019 15:15:12 -0700