Scraping target: Dianping (大众点评) - Beijing - Haidian District - food merchants - reviews from the last three months
1. Required setup: the Chrome browser, Python's selenium package, and chromedriver.
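Before starting, it can help to confirm that the pieces above are actually in place. A minimal check sketch; the chromedriver path is only an example and should point at wherever you unpacked chromedriver.exe:

import os
import selenium

# Example path only -- change it to where your chromedriver.exe actually lives.
CHROME_DRIVER = 'C:\\Users\\Administrator\\Desktop\\chromedriver_win32\\chromedriver.exe'

print('selenium version:', selenium.__version__)              # confirms the package imports
print('chromedriver found:', os.path.exists(CHROME_DRIVER))   # confirms the driver file exists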
2. Start Selenium: run the code below and a new Chrome window opens. This is the test browser; log in to your Dianping account in it manually. The review pages crawled later require a logged-in session, and automating the login inside the script is inconvenient, so log in once by hand and then grab the cookies.
3. If you need a crawling service, contact QQ: 739848314.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import json

# Path and file name of chromedriver
CHROME_DRIVER = 'C:\\Users\\Administrator\\Desktop\\chromedriver_win32\\chromedriver.exe'

'''# Optional: create a Chrome options object to speed the crawler up as needed.
opt = webdriver.ChromeOptions()
opt.add_argument('blink-settings=imagesEnabled=false')  # do not load images, which speeds things up
# If you use it, pass options=opt to webdriver.Chrome(...) below.'''

# Start the browser
driver = webdriver.Chrome(executable_path=CHROME_DRIVER)

# mer_info.csv is the link library to crawl. To crawl a single page, just set href to that URL.
href = pd.read_csv('mer_info.csv', encoding='GBK')
href = href['href']
driver.get(href[0])  # Open the site, log in to Dianping manually, then run the next step
4. Obtain the cookies; they can be saved to a file or used directly.
# Get the cookies
cookies = driver.get_cookies()

'''# Optionally save them to a file and read them back later.
f1 = open('cookie.txt', 'w')
f1.write(json.dumps(cookies))
f1.close()
f1 = open('cookie.txt')
cookie = f1.read()
cookie = json.loads(cookie)'''
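The commented-out block above only sketches the idea. A minimal, hedged version of the full round trip, reusing driver and href from step 2 and the cookie.txt file name from above: save the cookies once after the manual login, then in a later session restore every cookie and reload the page. Note that add_cookie only works after the driver has already opened a page on the same domain.

import json

# Save all cookies from the logged-in session (run once, right after logging in manually).
with open('cookie.txt', 'w') as f:
    json.dump(driver.get_cookies(), f)

# In a later session: open the site first, then restore every saved cookie and refresh.
driver.get(href[0])
with open('cookie.txt') as f:
    for c in json.load(f):
        driver.add_cookie(c)
driver.refresh()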
5. Crawler program
# Load the cookie; the page will then be logged in automatically when it is reloaded.
driver.add_cookie(cookies[0])

# Helper for grabbing tags: x is the list to return, y is the xpath of the content, z is the attribute to read.
def get_tag(x, y, z):
    xpath = y
    doc = driver.find_elements_by_xpath(xpath)
    x = [c.get_attribute(z) for c in doc]
    return x

# Define some empty lists
user_id = []
comment_time = []
comment1 = []
comment2 = []

# To split the work, change the range below, e.g. 100-200, 200-300, 300-400, 400-500, 500-600, 600-len(href)
for i in list(range(0, len(href))):
    # Load the page. Dianping URLs are very regular: besides the homepage, a merchant page is
    # shop/<merchant id>, a user page is member/<user id>, and search URLs are the encoded search string.
    url = href[i] + '/review_all'
    driver.get(url)
    # Loop over the pages of reviews
    for m in list(range(0, 100)):
        # Keep only reviews inside the target time window
        comment_time = driver.find_elements_by_class_name('time')
        comment_time = [n.text.split(' ')[0] for n in comment_time]
        x = pd.DataFrame({'time': comment_time})
        x['time'] = pd.to_datetime(x['time'], format='%Y-%m-%d %H:%M')
        # If this page has 2 or fewer reviews newer than 2018-07-27, stop crawling this merchant.
        if sum(x['time'] > pd.to_datetime('2018-7-27 15:00', format='%Y-%m-%d %H:%M')) <= 2:
            break
        else:
            # Grab the reviewers' ids
            user_id = get_tag(user_id, '//*[@id="review-list"]/div[2]/div[1]/div[3]/div[3]/ul/li/div/div[1]/a', 'href')
            # Automatically click to expand folded reviews
            for j in list(range(1, 25)):
                try:
                    driver.find_element_by_link_text('展开评论').click()  # "expand review" link; the text must match the page label exactly
                except:
                    break
            # Grab the length of each review. To keep the text itself, drop the len() call; note that
            # Dianping randomly replaces characters with images, so the captured text will have gaps.
            comment1 = driver.find_elements_by_xpath('//*[@id="review-list"]/div[2]/div[1]/div[3]/div[3]/ul/li/div/div[3]')
            comment2 = driver.find_elements_by_xpath('//*[@id="review-list"]/div[2]/div[1]/div[3]/div[3]/ul/li/div/div[4]')
            if len(comment2) == 0:
                comment1 = driver.find_elements_by_xpath('//*[@id="review-list"]/div[2]/div[1]/div[3]/div[2]/ul/li/div/div[3]')
                comment2 = driver.find_elements_by_xpath('//*[@id="review-list"]/div[2]/div[1]/div[3]/div[2]/ul/li/div/div[4]')
            comment_len1 = [len(k.text) for k in comment1]
            comment_len2 = [len(k.text) for k in comment2]
            try:
                # Splice the data into a DataFrame
                y = pd.DataFrame({'user_id': user_id,
                                  'time': x['time'],
                                  'comment_len1': comment_len1,
                                  'comment_len2': comment_len2})
                y['mer_href'] = href[i]
                # The i below must equal the starting value of the outer range above, otherwise an error is raised.
                if i == 0 and m == 0:
                    mer_comment_info_0 = y
                else:
                    mer_comment_info_0 = pd.concat([mer_comment_info_0, y], axis=0)
                mer_comment_info_0.to_csv('D:\\mer_comment_info156-667.csv')
                # Print one marker per page so progress can be checked at any time.
                print(str(i) + 'read' + str(m))
            except:
                # If something goes wrong, do not stop the program; print the failing page number instead.
                print(str(i) + '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!' + str(m))
                pass
        try:
            # Turn the page
            driver.find_element_by_link_text('下一页').click()  # the "next page" link
        except:
            # Last page reached; move on to the next href
            break
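The crawler above uses the find_elements_by_* style from Selenium 3, which newer Selenium releases (4.x) have removed. If you run a current Selenium, the same lookups are written with the By class that is already imported in step 2. A short sketch of the equivalents only, reusing CHROME_DRIVER from step 2 and leaving the crawler's logic unchanged:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the chromedriver path through a Service object.
driver = webdriver.Chrome(service=Service(CHROME_DRIVER))

# Selenium 4 equivalents of the lookups used in the crawler above.
comment_time = driver.find_elements(By.CLASS_NAME, 'time')
doc = driver.find_elements(By.XPATH, '//*[@id="review-list"]/div[2]/div[1]/div[3]/div[3]/ul/li/div/div[1]/a')
driver.find_element(By.LINK_TEXT, '下一页').click()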