Crawling Taobao product information

Keywords: Python

I believe that after learning Python web scraping, many people want to crawl websites with large amounts of data, and Taobao is a good target: it has a large amount of data and a wide variety of products, yet it is not very difficult, which makes it well suited for beginners. Here is the entire crawl process:

Step 1: Build a url for access

#Build a url for access
    # Search keyword; Taobao's search endpoint receives it as parameter 'q'.
    goods = "Fishtail skirt"
    page = 10          # number of result pages to crawl
    infoList = []      # accumulates one row per scraped item across all pages
    url = 'https://s.taobao.com/search'
    for i in range(page):
        # Taobao shows 44 items per page; 's' is the 1-based result offset.
        s_num = str(44*i+1)
        num = 44*i     # serial-number base for this page (used as goods_id later)
        data = {'q':goods,'s':s_num}

Step 2: Get Web Page Information

def getHTMLText(url, data):
    """Fetch one search-result page.

    Args:
        url: Base search URL.
        data: Query parameters (keyword 'q' and result offset 's').

    Returns:
        The page body as text, or the sentinel string "No page found"
        on any network/HTTP error (kept for backward compatibility).
    """
    try:
        rsq = requests.get(url, params=data, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        rsq.raise_for_status()
        return rsq.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so bugs (e.g. NameError) still surface.
        return "No page found"

Step 3: Extract the data you need using regular expressions

def parasePage(ilt, html, goods_id):
    """Extract goods fields from the raw search-page text.

    The page embeds its data as JSON-like ``"key":"value"`` pairs, so the
    fields are pulled out with regular expressions rather than an HTML parser.

    Args:
        ilt: List the result rows are appended to (mutated in place).
        html: Raw text of one search-result page.
        goods_id: Serial number of the last item before this page.

    Returns:
        ``ilt`` with one ``[id, price, sales, title, pic_url, detail_url]``
        row appended per item, or None when extraction fails.
    """
    def _value(pair):
        # '"key":"value"' -> 'value'. Split only on the FIRST colon so values
        # that themselves contain ':' are not truncated, and strip the quotes
        # instead of eval() — eval on scraped text is a code-injection risk.
        return pair.split(':', 1)[1].strip('"')

    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        slt = re.findall(r'\"view_sales\"\:\".*?\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        ult = re.findall(r'\"pic_url\"\:\".*?\"', html)
        dlt = re.findall(r'\"detail_url\"\:\".*?\"', html)
        for i in range(len(plt)):
            goods_id += 1
            price = _value(plt[i])
            sales = _value(slt[i])
            title = _value(tlt[i])
            # Page URLs are protocol-relative ("//..."); make them absolute.
            pic_url = "https:" + _value(ult[i])
            detail_url = "https:" + _value(dlt[i])
            ilt.append([goods_id, price, sales, title, pic_url, detail_url])
        return ilt
    except (IndexError, AttributeError):
        # Field lists of unequal length (page layout changed) land here.
        print("We can't find what you need!")

Step 4: Save the data to a csv file

def saveGoodsList(ilt):
    """Write the collected rows to goods.csv, preceded by a header row.

    Args:
        ilt: Iterable of [id, price, sales, title, pic_url, detail_url] rows.
    """
    # newline='' is required by the csv module (prevents blank lines between
    # rows on Windows); utf-8 keeps non-ASCII titles intact.
    with open('goods.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["serial number", "Price", "volume", "Commodity Name","Merchandise Picture Web Address","Merchandise Details Web Site"])
        writer.writerows(ilt)

Here is the complete code:

import csv
import requests
import re
 
#Get Web Page Information
def getHTMLText(url, data):
    """Fetch one search-result page.

    Args:
        url: Base search URL.
        data: Query parameters (keyword 'q' and result offset 's').

    Returns:
        The page body as text, or the sentinel string "No page found"
        on any network/HTTP error (kept for backward compatibility).
    """
    try:
        rsq = requests.get(url, params=data, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        rsq.raise_for_status()
        return rsq.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so bugs (e.g. NameError) still surface.
        return "No page found"
 
#Extract the data you need with regular expressions
def parasePage(ilt, html, goods_id):
    """Extract goods fields from the raw search-page text.

    The page embeds its data as JSON-like ``"key":"value"`` pairs, so the
    fields are pulled out with regular expressions rather than an HTML parser.

    Args:
        ilt: List the result rows are appended to (mutated in place).
        html: Raw text of one search-result page.
        goods_id: Serial number of the last item before this page.

    Returns:
        ``ilt`` with one ``[id, price, sales, title, pic_url, detail_url]``
        row appended per item, or None when extraction fails.
    """
    def _value(pair):
        # '"key":"value"' -> 'value'. Split only on the FIRST colon so values
        # that themselves contain ':' are not truncated, and strip the quotes
        # instead of eval() — eval on scraped text is a code-injection risk.
        return pair.split(':', 1)[1].strip('"')

    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        slt = re.findall(r'\"view_sales\"\:\".*?\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        ult = re.findall(r'\"pic_url\"\:\".*?\"', html)
        dlt = re.findall(r'\"detail_url\"\:\".*?\"', html)
        for i in range(len(plt)):
            goods_id += 1
            price = _value(plt[i])
            sales = _value(slt[i])
            title = _value(tlt[i])
            # Page URLs are protocol-relative ("//..."); make them absolute.
            pic_url = "https:" + _value(ult[i])
            detail_url = "https:" + _value(dlt[i])
            ilt.append([goods_id, price, sales, title, pic_url, detail_url])
        return ilt
    except (IndexError, AttributeError):
        # Field lists of unequal length (page layout changed) land here.
        print("We can't find what you need!")
 
#Save data to csv file
def saveGoodsList(ilt):
    """Write the collected rows to goods.csv, preceded by a header row.

    Args:
        ilt: Iterable of [id, price, sales, title, pic_url, detail_url] rows.
    """
    # newline='' is required by the csv module (prevents blank lines between
    # rows on Windows); utf-8 keeps non-ASCII titles intact.
    with open('goods.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["serial number", "Price", "volume", "Commodity Name","Merchandise Picture Web Address","Merchandise Details Web Site"])
        writer.writerows(ilt)
 
#Run the program
if __name__ == '__main__':
    # Build the search request: Taobao pages results 44 items at a time,
    # so page i starts at 1-based offset 44*i + 1 (query parameter 's').
    goods = "Fishtail skirt"
    page = 10                     # number of result pages to crawl
    infoList = []                 # one row per item, across all pages
    url = 'https://s.taobao.com/search'
    for i in range(page):
        num = 44 * i              # serial-number base for this page
        data = {'q': goods, 's': str(num + 1)}
        try:
            html = getHTMLText(url, data)
            # Appends rows to infoList in place; its return value is the
            # same list (or None on a parse failure), so we ignore it here.
            parasePage(infoList, html, num)
        except Exception:
            # Best-effort crawl: skip a failing page and keep going.
            continue
    # Write once after the crawl instead of rewriting the whole CSV on every
    # page iteration (the original also risked passing None to saveGoodsList).
    saveGoodsList(infoList)

The results are as follows:

Posted by dxdolar on Thu, 20 Feb 2020 08:17:44 -0800