Python data analysis: the most popular bread on Jingdong

Keywords: lambda, JSON, encoding

Preface

 

Which brand of bread sells best? Which flavor is the most popular? Anyone who likes bread probably cares about these questions. By crawling bread data from Jingdong, this article answers these common questions about bread and, along the way, delivers a complete data report that you can use for reference.

Data crawling (partial code)

Build a proxy opener for requesting the detail pages

from urllib import request
import re
import time

PROXY_API = 'http://www.agent.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=8f75fb741de34cfb95adf347910db7a9&orderno=YZ20191169208Yi1jmu&returnType=2&count=1'

def disguiser():
    '''
    Build a proxy-enabled opener for requesting the detail pages.
    Returns None if the proxy API cannot be reached twice in a row.
    '''
    for attempt in range(2):
        try:
            req = request.Request(PROXY_API)
            resp = request.urlopen(req)
            jsonIP = resp.read().decode()
            jsonIP = re.sub(' ', '', jsonIP)
            # Pull the ip/port pairs out of the API's JSON answer
            ipList = re.findall('"ip":"(.*?)"', jsonIP)
            portList = re.findall('"port":"(.*?)"', jsonIP)
            value = list(map(lambda x, y: x + ':' + y, ipList, portList))
            key = ['http']
            ipDict = {key[index]: value[index] for index in range(len(key))}
            print(ipDict)
            # 1. Use ProxyHandler and pass in the proxy to build a handler
            handler = request.ProxyHandler(ipDict)  # key: http/https, val: ip:port
            # 2. Use the handler created above to build an opener
            opener = request.build_opener(handler)
            return opener
        except Exception:
            time.sleep(6)  # the proxy API rate-limits; wait before the single retry
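
As a quick usage sketch (the item URL below is only a placeholder, not from the original code), the returned opener is used in place of urllib's default opener so that every request goes out through the proxy:

opener = disguiser()
if opener is not None:  # None means both calls to the proxy API failed
    req = request.Request('https://item.jd.com/100000000000.html')  # placeholder detail URL
    resp = opener.open(req)
    print(resp.getcode())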


Parse the contents of the detail page

import json
import random
import re
import time
from urllib import request
from lxml import etree

SERVER_ERROR_MIN, SERVER_ERROR_MAX = 500, 600  # HTTP 5xx status range

def parser(pageQueue, uaPool, priceRequestDoc, PRICEBASEURL, detailRequestDoc, urlopen):
    '''
    Parse the contents of the detail page.
    urlopen is the open method of the proxy opener built by disguiser().
    '''
    detailUrl = pageQueue.get()[1]
    print(detailUrl)

    # --- Price (fetched from the price API, not from the HTML page) ---
    PRICEURL = PRICEBASEURL + re.search(r'\d+', detailUrl).group()
    priceRequestDoc = re.sub(r' ', '', priceRequestDoc)
    # Turn the raw request text copied from the browser into a headers dict
    headers_for_price = dict(re.findall(r'([-\w\d]*?):(.*)', priceRequestDoc))
    headers_for_price.update(random.choice(uaPool))  # rotate the User-Agent
    req = request.Request(PRICEURL, headers=headers_for_price)
    resp = urlopen(req)  # first request: commodity price
    print(PRICEURL, 'Commodity price page request response code:', resp.getcode())
    info = None
    if resp.getcode() == 200:
        info = resp.read().decode()
    elif SERVER_ERROR_MIN <= resp.getcode() < SERVER_ERROR_MAX:
        # Retry on 5xx server errors with an increasing back-off
        for i in range(5):
            time.sleep(i ** i)  # 1s, 1s, 4s, 27s, 256s; could be tuned further
            resp = urlopen(req)
            if resp.getcode() == 200:
                info = resp.read().decode()
                break
    elif resp.getcode() == 404:
        print('page not found')
    elif resp.getcode() == 403:
        print('have no right')
    if info is None:
        return
    info = json.loads(info)
    item_price = info[0]['p']

    # --- Name, brand, sugar, shelf life, ingredients, packaging, origin ---
    detailRequestDoc = re.sub(r' ', '', detailRequestDoc)
    headers_for_detail = dict(re.findall(r'([-\w\d]*?):(.*)', detailRequestDoc))
    headers_for_detail.update(random.choice(uaPool))  # rotate the User-Agent
    req = request.Request(detailUrl, headers=headers_for_detail)
    resp = urlopen(req)  # second request: the detail page itself
    print(detailUrl, 'Detail page request response:', resp.getcode())
    if resp.getcode() == 200:
        pass
    elif SERVER_ERROR_MIN <= resp.getcode() < SERVER_ERROR_MAX:
        for i in range(5):
            time.sleep(i ** i)
            resp = urlopen(req)
            if resp.getcode() == 200:
                break
    elif resp.getcode() == 404:
        print(detailUrl, 'page not found')
        return
    elif resp.getcode() == 403:
        print(detailUrl, 'have no right')
        return

    htmlParser = etree.HTMLParser(encoding='gbk')  # JD detail pages are GBK-encoded
    html = etree.parse(resp, parser=htmlParser)
    elements = html.xpath("//ul[@class='parameter2 p-parameter-list']//text() | //dl[@class='clearfix']//text()")
    detailInfo = list(filter(lambda msg: len(msg.strip()) > 0 and msg, elements))
    detailInfo = '#'.join(detailInfo)

    def field(label):
        # Pull a 'label:value' pair out of the '#'-joined parameter text;
        # on the live page these labels are Chinese (translated here)
        found = re.search(label + ':(.*?)#', detailInfo)
        return found.group(1) if found else 'n'

    item_name = field('Commodity name')
    try:
        item_id = re.search(r'\d+', detailUrl).group()
    except AttributeError:
        item_id = 'n'
    # Full commodity name: the <title> usually reads '【promo】name【...】-京东',
    # so keep the text between the bracket groups
    elementTitle = html.xpath('//title//text()')[0].strip()
    try:
        item_fullName = re.search('(【.*?】)*(.*?)【', elementTitle).group(2)
    except AttributeError:
        item_fullName = elementTitle
    # Brand from the breadcrumb bar
    elementBrand = html.xpath("//*[@id='crumb-wrap']/div/div[1]/div[7]/div/div/div[1]/a/text()")
    elementBrand = list(filter(lambda msg: len(msg.strip()) > 0 and msg, elementBrand))
    try:
        item_brand = elementBrand[0]
    except IndexError:
        item_brand = 'npl'
    yield {
        'item_id': item_id,
        'item_fullName': item_fullName,
        'item_name': item_name,
        'item_price': item_price,
        'item_brand': item_brand,
        # The remaining fields come from the same '#'-joined parameter text
        'gross_weight': field('Gross weight'),
        'item_origin': field('Origin'),
        'item_certification': field('Certification'),
        'processing_technology': field('Processing technology'),
        'packing_unit': field('Packing unit'),
        'is_suger': field('Sugar'),
    }
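
For context, here is a minimal sketch of how the two functions above might be wired together. Since only part of the code is shown, the queue contents, the raw header strings, the User-Agent pool, and the price API base URL are all assumptions rather than the author's exact setup:

from queue import Queue

# Raw request header text as copied from the browser's developer tools (assumed shape)
priceRequestDoc = 'Host:p.3.cn\nReferer:https://item.jd.com/'
detailRequestDoc = 'Host:item.jd.com'
PRICEBASEURL = 'https://p.3.cn/prices/mgets?skuIds=J_'  # assumed JD price endpoint
uaPool = [{'User-Agent': 'Mozilla/5.0'}]  # pool of User-Agent headers to rotate

pageQueue = Queue()
pageQueue.put((0, 'https://item.jd.com/100000000000.html'))  # (index, detail url), placeholder id

opener = disguiser()  # assumes the proxy API call succeeded
for item in parser(pageQueue, uaPool, priceRequestDoc, PRICEBASEURL,
                   detailRequestDoc, opener.open):
    print(item)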


Since space in an official-account article is limited, we cannot show all of this article's code here. The full code has been uploaded to a Baidu cloud disk; reply "bread" in the background of the official account to get it.


Data analysis
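
The charts for this section did not survive extraction, but to give a feel for the analysis, here is a rough sketch of how the questions from the preface (which brands dominate the listings, how prices compare) could be explored once the scraped items are saved one JSON object per line. The file name and the pandas approach are assumptions, not the author's code:

import json
import pandas as pd

# Load one JSON object per line, as yielded by parser() (hypothetical output file)
with open('bread.jsonl', encoding='utf-8') as f:
    df = pd.DataFrame([json.loads(line) for line in f])

df['item_price'] = pd.to_numeric(df['item_price'], errors='coerce')
print(df['item_brand'].value_counts().head(10))                      # most common brands
print(df.groupby('item_brand')['item_price'].mean().sort_values())   # average price by brand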


Posted by mechamecha on Mon, 18 Nov 2019 11:01:58 -0800