Scraping Xiaomi product information

Keywords: Python encoding Selenium Excel


Scraping the Xiaomi site goes smoothly:

Put the two links together and run once to get them all (about 700)
Using a combination of selenium+chrome+lxml
 (Quickly, because it's just one page)


The program generates three files: two CSVs and one XLS.
 CSV is compact and versatile:
 data_mi.csv uses UTF-8 encoding.
data_mi-gbk.csv uses GBK encoding.
XLS is Excel's native format.

(GBK is a Chinese character encoding that Excel can open directly; UTF-8 is Python's default encoding and can be opened by most professional text tools.)

Attach code

import csv
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import xlwt

class Spider:
    """Scrape Xiaomi product listings with headless Chrome and save them
    to CSV (UTF-8 and GBK) files.

    Workflow: ``run()`` fetches and parses both listing pages and appends
    every item to ``data_mi.csv``; ``u8togbk()`` re-encodes that file to
    GBK; ``mkxls()`` converts the GBK CSV to an Excel .xls file.
    """

    def __init__(self):
        # Total wall-clock time of the last run(), in seconds (set by run()).
        self.runtime = None
        # NOTE(review): the original URL list was lost when this snippet was
        # extracted; these are placeholders for the two Xiaomi listing pages
        # the article scrapes -- TODO restore the real links.
        self.url = [
            'https://www.mi.com/list.html',
            'https://www.mi.com/list.html',
        ]
        self.csvfilename = 'data_mi.csv'
        self.csvfilenamegbk = 'data_mi-gbk.csv'
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome without a visible window
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        # Explicit wait, up to 20 s, used before reading page content.
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        """Entry point: scrape both listing pages, save every item, record runtime."""
        start = time.time()

        # First listing page
        for item in self.parse_page(self.get_page(self.url[0])):
            self.save_data(item)

        # Second listing page
        for item in self.parse_page(self.get_page(self.url[1])):
            self.save_data(item)

        self.runtime = time.time() - start

    def get_page(self, url):
        """Load *url*, scroll to trigger lazy loading, and return the page HTML.

        :param url: str, page address to fetch
        :return: str, ``browser.page_source`` after scrolling
        """
        self.browser.get(url)
        # Wait until the first product image exists before interacting.
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[3]/div/div[2]/div/div[1]/div[1]/img')))
        # Simulate scrolling down so lazily-loaded images/content are fetched.
        for _ in range(50):
            js_to_buttom = "window.scrollBy(0,1000)"
            self.browser.execute_script(js_to_buttom)
            time.sleep(0.1)
        # Give the page a moment to finish loading after the final scroll.
        time.sleep(2)
        return self.browser.page_source

    def parse_page(self, text):
        """Parse a listing page and yield one record per product.

        :param text: str, raw HTML of a listing page
        :yield: list [category, name, introduction, price, image_url]
        :raises ValueError: if the per-category column counts disagree
        """
        html = etree.HTML(text)
        # Product categories live in container divs 2..16 of the page root.
        for index in range(2, 17):
            classes = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/h2/text()'.format(index))[0]

            names = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[1]/text()'.format(index))
            introduces = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[2]/text()'.format(index))
            prices = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[3]/span[2]/text()'.format(index))
            imgs = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/div[1]/img/@src'.format(index))

            # Bug fix: the original used a chained `!=` comparison, which does
            # NOT verify that all four lists have the same length. Use chained
            # `==` (all-equal) and raise a descriptive exception.
            if not (len(names) == len(introduces) == len(prices) == len(imgs)):
                raise ValueError('mismatched column counts in category {!r}'.format(classes))
            for i in range(len(names)):
                yield [classes, names[i], introduces[i], prices[i], imgs[i]]

    def save_data(self, item):
        """Append one record to the UTF-8 CSV file.

        :param item: list of str, one product record
        """
        with open(self.csvfilename, 'a', encoding='utf-8', newline='') as csvfile:
            print('item >>> ', item)
            writer = csv.writer(csvfile)
            writer.writerow(item)  # bug fix: the row was never actually written

    def u8togbk(self, infn, outfn):
        """Re-encode CSV *infn* (UTF-8) to *outfn* (GBK).

        Rows containing characters that GBK cannot represent are skipped,
        so the GBK file may hold slightly fewer rows than the UTF-8 one.

        :param infn: str, path of the UTF-8 source CSV
        :param outfn: str, path of the GBK destination CSV
        """
        with open(infn, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            results = list(reader)

        with open(outfn, 'w', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            for result in results:
                try:
                    writer.writerow(result)
                except UnicodeEncodeError:
                    # Character has no GBK mapping -- drop this row.
                    pass

    def mkxls(self, out_filename):
        """Convert the GBK CSV file to an Excel .xls file.

        :param out_filename: str, path of the .xls file to create
        """

        def csv_to_xlsx(csvfile, outfile):
            """Copy *csvfile* cell-by-cell into a new .xls workbook.

            :param csvfile: str, source CSV path
            :param outfile: str, destination .xls path
            :return: None
            """
            with open(csvfile) as fc:
                r_csv = csv.reader(fc)
                workbook = xlwt.Workbook()
                sheet = workbook.add_sheet('sheet1')  # create a worksheet
                i = 0  # row index
                for line in r_csv:
                    j = 0  # column index
                    for v in line:
                        sheet.write(i, j, v)
                        j = j + 1
                    i = i + 1
            # Save the Excel workbook (was missing in the original).
            workbook.save(outfile)

        csv_to_xlsx(self.csvfilenamegbk, out_filename)

    def time(self):
        """Return a human-readable summary of the last run's duration."""
        return 'Total time:{}second'.format(self.runtime)

if __name__ == '__main__':
    spider = Spider()
    spider.run()  # bug fix: the crawler was constructed but never started
    # Convert the UTF-8 CSV to GBK, then to an Excel-readable .xls file.
    spider.u8togbk(spider.csvfilename, spider.csvfilenamegbk)
    spider.mkxls('data_mi.xls')
    # Bug fix: spider.time (no parentheses) printed the bound method object;
    # call it to print the formatted total running time.
    print(spider.time())

Output file list

Output file format

Note: the web page is encoded in UTF-8. When saving to GBK, characters that GBK cannot represent have to be dropped, so the UTF-8 file contains roughly 2-3% more content than the GBK one.

Posted by Jewbilee on Tue, 08 Oct 2019 23:18:24 -0700