Explanation
Crawling Xiaomi Youpin:
Both category links are combined so that a single run fetches all of the products (about 700). The crawler uses a combination of selenium + headless Chrome + lxml, and it is fast because each category is just a single page.
Output:
The program generates three files: two CSVs and one xls. CSV is a compact and versatile format: data_mi.csv uses UTF-8 encoding, while data_mi-gbk.csv uses GBK encoding. The xls file is the Excel format. (GBK is a Chinese encoding that Excel can open directly; UTF-8 is Python's default encoding and can be opened with professional tools.)
The code is attached below:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree
import csv
import xlwt
import time


class Spider:
    """Scrape ~700 products from two Xiaomi Youpin category pages.

    Uses headless Chrome (selenium) to render each page, lxml to parse
    the rendered HTML, and writes three output files:

    * ``data_mi.csv``     -- UTF-8 encoded CSV (one row per product)
    * ``data_mi-gbk.csv`` -- GBK re-encoding of the same rows (Excel-friendly)
    * an ``.xls`` workbook produced from the GBK CSV by :meth:`mkxls`
    """

    def __init__(self):
        # Total wall-clock duration of run(), in seconds (set by run()).
        self.runtime = None
        # The two category pages to scrape (home appliances / smart home).
        self.url = [
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=115&secondId=115&title=%E5%AE%B6%E7%94%A8%E7%94%B5%E5%99%A8&spmref=YouPinPC.$Home$.list.0.90827029',
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=116&secondId=116&title=%E6%99%BA%E8%83%BD%E5%AE%B6%E5%BA%AD&spmref=YouPinPC.$Home$.list.0.93586205'
        ]
        self.csvfilename = 'data_mi.csv'
        self.csvfilenamegbk = 'data_mi-gbk.csv'
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome without a window
        # `chrome_options=` is deprecated (removed in Selenium 4.10);
        # `options=` is accepted from Selenium 3.8 onwards.
        self.browser = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        """Entry point: scrape both category pages and write all CSV output."""
        start = time.time()
        # First category page.
        for item in self.parse_page(self.get_page(self.url[0])):
            self.save_data(item)
        # Second category page.
        for item in self.parse_page(self.get_page(self.url[1])):
            self.save_data(item)
        # Re-encode the UTF-8 CSV to GBK so Excel can open it directly.
        self.u8togbk(self.csvfilename, self.csvfilenamegbk)
        end = time.time()
        self.runtime = end - start

    def get_page(self, url):
        """Load *url*, scroll to trigger lazy loading, and return the HTML.

        Waits up to 20 s for the first product image to appear, then
        scrolls down 50 times so lazily-loaded products are fetched.
        """
        self.browser.get(url)
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[3]/div/div[2]/div/div[1]/div[1]/img')))
        # Simulated scrolling: each step moves the viewport 1000 px down.
        for _ in range(50):
            self.browser.execute_script("window.scrollBy(0,1000)")
            time.sleep(0.05)
        # Final grace period for the page to finish loading.
        time.sleep(5)
        return self.browser.page_source

    def parse_page(self, text):
        """Parse page HTML and yield [category, name, intro, price, img] rows.

        :param text: full HTML source returned by :meth:`get_page`
        :raises Exception: if the scraped columns have mismatched lengths
        """
        html = etree.HTML(text)
        # Divs 2..16 under the root container are the category sections.
        for index in range(2, 17):
            classes = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/h2/text()'.format(index))[0]
            names = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[1]/text()'.format(index))
            introduces = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[2]/text()'.format(index))
            prices = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[3]/span[2]/text()'.format(index))
            imgs = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/div[1]/img/@src'.format(index))
            # All four lists must have the same length or rows would be
            # misaligned.  (The original chained `!=` comparison missed
            # most mismatches, e.g. lengths 5,5,4,4 passed the check.)
            if not (len(names) == len(introduces) == len(prices) == len(imgs)):
                raise Exception(
                    'column length mismatch in section {}: names={} introduces={} prices={} imgs={}'.format(
                        index, len(names), len(introduces), len(prices), len(imgs)))
            print(len(names), len(introduces), len(prices), len(imgs))
            for i in range(len(names)):
                yield [classes, names[i], introduces[i], prices[i], imgs[i]]

    def save_data(self, item):
        """Append one product row to the UTF-8 CSV file."""
        with open(self.csvfilename, 'a', encoding='utf-8', newline='') as csvfile:
            print('item >>> ', item)
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def u8togbk(self, infn, outfn):
        """Copy CSV *infn* (UTF-8) to *outfn* re-encoded as GBK.

        Rows containing characters that GBK cannot represent are skipped
        (deliberate best effort), so the GBK file may hold slightly fewer
        rows than the UTF-8 one.
        """
        with open(infn, 'r', encoding='utf-8') as f:
            results = list(csv.reader(f))
        with open(outfn, 'w', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            for result in results:
                try:
                    writer.writerow(result)
                except UnicodeEncodeError:
                    # Only the expected encoding failure is ignored; any
                    # other error should surface instead of being hidden.
                    pass

    def mkxls(self, out_filename):
        """Convert the GBK CSV into an Excel .xls file named *out_filename*."""

        def csv_to_xlsx(csvfile, outfile):
            # Open the GBK CSV explicitly; the original relied on the
            # locale default encoding, which breaks on non-Chinese systems.
            with open(csvfile, encoding='gbk', newline='') as fc:
                r_csv = csv.reader(fc)
                workbook = xlwt.Workbook()
                sheet = workbook.add_sheet('sheet1')  # create a sheet
                for i, line in enumerate(r_csv):
                    for j, v in enumerate(line):
                        sheet.write(i, j, v)
                workbook.save(outfile)  # save the Excel workbook

        csv_to_xlsx(self.csvfilenamegbk, out_filename)

    @property
    def time(self):
        """Human-readable total runtime (valid after run() completes)."""
        return 'Total time:{}second'.format(self.runtime)


if __name__ == '__main__':
    spider = Spider()
    spider.run()                 # run the crawler
    spider.mkxls('data_mi.xls')  # convert the GBK CSV to an Excel .xls file
    print(spider.time)           # total running time
Output file list
Output file format
Note: the web page is encoded in UTF-8. When saving to GBK, some characters are not supported by that encoding and have to be dropped, so the UTF-8 file will contain roughly 2-3% more content than the GBK one.