Today, while browsing Weibo, it occurred to me to use Python to crawl and download a few pictures. It turns out Weibo is different from ordinary websites: to get any content at all, you first have to copy a cookie out of the browser and send it with the request, otherwise the automatic redirect fails. I haven't found a better solution yet; if I work one out later I'll update this post, but for now this will do.
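Before the full script, here is a minimal sketch of how a Cookie header copied from the browser's developer tools could be fed to requests as a proper cookie jar instead of a raw header string. The cookie value below is a made-up placeholder, not a working login.

# -*- coding: utf-8 -*-
import requests

# Hypothetical cookie string copied from the browser's developer tools
raw_cookie = 'SINAGLOBAL=123; SUB=abc; SUBP=xyz'

# Split the "name=value; name=value" pairs into a dict that requests understands
cookies = dict(
    pair.strip().split('=', 1) for pair in raw_cookie.split(';') if '=' in pair
)

session = requests.Session()
session.cookies.update(cookies)  # every request on this session now sends them
r = session.get('https://weibo.com/')
print(r.status_code)

The script below takes the simpler route and just passes the raw string as a Cookie header; both approaches work.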
# coding=utf-8
# Weibo: download a few pictures

from lxml import etree
import requests
import re
import os

# Crawler headers; the Cookie value is copied out of the browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Cookie': 'UM_distinctid=15edca346a9aa5-0e9dbf8c2d480e-c303767-1fa400-15edca346aa7f1; SINAGLOBAL=6419057313447.329.1506940241879; SUB=_2AkMtGX-odcPxrAZXmvkQz2ngaYlH-jyezBZeAn7uJhMyAxgv7gcrqSdutBF-XKGzh9qGX4VXKOXpwXgdx6xYmTh9; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9W5Z6K5XQdMeqjmQKwInwfgd5JpVF02N1hzc1h-Eeo5f; UOR=www.baidu.com,vdisk.weibo.com,www.baidu.com; login_sid_t=2005b825ef14960898222c97937bfee8; cross_origin_proto=SSL; YF-Ugrow-G0=5b31332af1361e117ff29bb32e4d8439; YF-V5-G0=69afb7c26160eb8b724e8855d7b705c6; _s_tentry=www.baidu.com; Apache=2728703623251.8203.1516764166064; ULV=1516764166071:35:7:1:2728703623251.8203.1516764166064:1516351741102'
}

session = requests.Session()
r = session.get(url='https://weibo.com/', headers=headers)
print(r.status_code)  # check the response status
# print(r.text)       # dump the decoded response body

# Parse the page with xpath and grab every <script> tag
xpath_html = etree.HTML(r.text)
src_list = xpath_html.xpath("//script")

# Collect the image URLs
photo_album = []
for s in src_list:
    text = s.text
    if not text:  # external scripts (<script src=...>) carry no inline text
        continue
    # The markup is embedded as an escaped string inside the script, so xpath
    # cannot parse it; pull the src attributes out with a regular expression
    data = re.findall(r'<img.*?src=(.*?) .*?>', text)
    print(len(data))
    for d in data:
        if d:
            photo_album.append(d.replace('\\', ''))
            print(d.replace('\\', ''))

# Make sure the save directory exists
store_dir = os.path.join(os.getcwd(), 'download')
if not os.path.isdir(store_dir):
    os.mkdir(store_dir)

i = 0
for e in photo_album:
    # Tidy up the URL
    e = e.replace('"', '')
    print(e)
    try:
        # Fetch the remote picture
        pic = requests.get(e, timeout=10)
    except requests.RequestException:
        print('Picture cannot be downloaded')
        continue
    # Save it as <index>.<original extension>
    path = os.path.join(store_dir, str(i) + '.' + e.split('.')[-1])
    try:
        with open(path, 'wb') as fp:
            fp.write(pic.content)
    except OSError:
        continue
    i += 1

# API for loading Weibo posts asynchronously; change its parameters for more.
# Note that the data Weibo returns is written directly inside <script> tags,
# so after getting a script tag its text cannot be parsed with xpath and
# similar tools; use a regular expression instead.
# https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=0&page=2&lefnav=0&__rnd=1516764168254
# r = requests.get(url='https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=0&page=2&lefnav=0&__rnd=1516764168254')
# print(r.status_code)
# print(r.text)
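To make that last comment concrete, here is a minimal sketch of paging through the loadingmoreunlogin endpoint. It treats the response body as raw text and scans it with the same <img src=...> regex as above; dropping the __rnd cache-buster parameter and reusing placeholder headers are assumptions on my part, not behaviour confirmed by the script.

import re
import requests

# Template for the async endpoint; only the page parameter changes
BASE = ('https://weibo.com/a/aj/transform/loadingmoreunlogin'
        '?ajwvr=6&category=0&page={page}&lefnav=0')

# Reuse the real User-Agent/Cookie pair from the script above;
# these values are placeholders
headers = {'User-Agent': 'Mozilla/5.0', 'Cookie': '<copied from the browser>'}

urls = []
for page in range(1, 4):  # first three pages, just as an example
    r = requests.get(BASE.format(page=page), headers=headers, timeout=10)
    if r.status_code != 200:
        break
    # The returned markup is an escaped string, so regex it directly,
    # exactly as with the inline <script> text in the main script
    for src in re.findall(r'<img.*?src=(.*?) .*?>', r.text):
        urls.append(src.replace('\\', '').replace('"', ''))

print(len(urls), 'image URLs collected')

The collected URLs could then be fed to the same download loop as in the main script.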