Python web crawler notes (3): Downloading Blog Garden posts to Word documents

Keywords: Python, pip, less

(1) Description

Building on the previous article, we use lxml to extract the body content of Blog Garden (cnblogs) posts and save it to a Word document.

The following module is used to work with Word documents:

pip install python-docx
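python-docx provides the Document class used below. As a quick sanity check, a minimal example (the file name test.docx is just an illustration) looks like this:

import docx

# create a blank document, add a heading and a paragraph, then save it
doc = docx.Document()
doc.add_heading('Test heading', 0)
doc.add_paragraph('Test paragraph.')
doc.save('test.docx')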

Modified code (the main change is the following block added inside the while loop of link_crawler()):

        tree = lxml.html.fromstring(html)  # parse the HTML into an element tree
        title = tree.xpath('//a[@id="cb_post_title_url"]')  # post title
        the_file = tree.xpath('//div[@id="cnblogs_post_body"]/p')  # body paragraphs
        pre = tree.xpath('//pre')  # code blocks (inserted with Blog Garden's code-insertion feature)
        img = tree.xpath('//div[@id="cnblogs_post_body"]/p/img/@src')  # image URLs
        # Change the working directory
        os.chdir(r'F:\Python\worm\Blog Garden files')
        # Create a new, blank Word document
        doc = docx.Document()
        # Add the title
        doc.add_heading(title[0].text_content(), 0)
        for i in the_file:
            # Add each body paragraph (the content of each p tag) to the Word document
            doc.add_paragraph(i.text_content())
        # Add the code blocks to the document
        for p in pre:
            doc.add_paragraph(p.text_content())
        # Add the images to the Word document
        for i in img:
            ure.urlretrieve(i, '0.jpg')
            doc.add_picture('0.jpg')
        # Use the first 8 characters of the title as the Word file name
        filename = title[0].text_content()[:8] + '.docx'
        # Save the Word document
        # If the file name already exists, save as title[0].text_content()[:8] + str(x) + '.docx'; otherwise save as filename
        if filename in os.listdir(r'F:\Python\worm\Blog Garden files'):
            doc.save(title[0].text_content()[:8] + str(x) + '.docx')
            x += 1
        else:
            doc.save(filename)
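A slightly more robust way to avoid overwriting files, shown here only as a sketch (unique_name is a hypothetical helper, not part of the original script), is to keep incrementing a counter until a free name is found:

import os

def unique_name(base, ext='.docx'):
    # return base + ext, or base + counter + ext if that file already exists
    name = base + ext
    counter = 1
    while os.path.exists(name):
        name = base + str(counter) + ext
        counter += 1
    return name

# e.g. doc.save(unique_name(title[0].text_content()[:8]))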

(2) Complete code (delayed.py is not repeated here; it is the same as in the previous article)

It is better to set the rate limit a little on the high side. The delay in the following line is given in seconds.

waitFor = WaitFor(2)
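delayed.py is not reproduced here, but as a reminder of what it does, a minimal sketch of such a WaitFor class might look like the following (the per-domain timing bookkeeping is an assumption based on how wait(url) is used below):

import time
import urllib.parse

class WaitFor:
    def __init__(self, delay):
        self.delay = delay          # minimum interval between requests to the same domain, in seconds
        self.last_visited = {}      # domain -> time of the last request

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last = self.last_visited.get(domain)
        if self.delay > 0 and last is not None:
            remaining = self.delay - (time.time() - last)
            if remaining > 0:
                time.sleep(remaining)
        self.last_visited[domain] = time.time()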
import urllib.request as ure
import re
import urllib.parse
from delayed import WaitFor
import lxml.html
import os
import docx

# Download a web page and return its HTML (dynamically loaded content cannot be downloaded)
def download(url, user_agent='FireDrich', num=2):
    print('download:' + url)
    # Set the user agent
    headers = {'User-Agent': user_agent}
    request = ure.Request(url, headers=headers)
    try:
        # Download the page
        html = ure.urlopen(request).read()
    except ure.URLError as e:
        print('Download failed: ' + str(e.reason))
        html = None
        if num > 0:
            # On a 5xx error, recursively call itself to retry the download, at most twice
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num - 1)
    return html

# seed_url: a starting url, e.g. https://www.cnblogs.com/
# link_regex: a regular expression
# Function: extract all page links matching link_regex and download them
def link_crawler(seed_url, link_regex):
    html = download(seed_url)
    crawl_queue = []
    # Iterate over the list returned by get_links() and add links matching link_regex to the queue
    for link in get_links(html):
        if re.match(link_regex, link):
            # Join seed_url (e.g. https://www.cnblogs.com/) with the relative link (e.g. /cate/...)
            link = urllib.parse.urljoin(seed_url, link)
            # Do not add duplicates to the list
            if link not in crawl_queue:
                crawl_queue.append(link)
    x = 0
    # WaitFor.wait() throttles downloads: if less than 2 seconds have passed since the last request,
    # wait until 2 seconds have elapsed; if more time has already passed, continue immediately
    waitFor = WaitFor(2)
    # Download every page in crawl_queue
    while crawl_queue:
        # Remove the item at the end of the list
        url = crawl_queue.pop()
        waitFor.wait(url)
        html = download(url)
        tree = lxml.html.fromstring(html)  # parse the HTML into an element tree
        title = tree.xpath('//a[@id="cb_post_title_url"]')  # post title
        the_file = tree.xpath('//div[@id="cnblogs_post_body"]/p')  # body paragraphs
        pre = tree.xpath('//pre')  # code blocks (inserted with Blog Garden's code-insertion feature)
        img = tree.xpath('//div[@id="cnblogs_post_body"]/p/img/@src')  # image URLs
        # Change the working directory
        os.chdir(r'F:\Python\worm\Blog Garden files')
        # Create a new, blank Word document
        doc = docx.Document()
        # Add the title
        doc.add_heading(title[0].text_content(), 0)
        for i in the_file:
            # Add each body paragraph (the content of each p tag) to the Word document
            doc.add_paragraph(i.text_content())
        # Add the code blocks to the document
        for p in pre:
            doc.add_paragraph(p.text_content())
        # Add the images to the Word document
        for i in img:
            ure.urlretrieve(i, '0.jpg')
            doc.add_picture('0.jpg')
        # Use the first 8 characters of the title as the Word file name
        filename = title[0].text_content()[:8] + '.docx'
        # Save the Word document
        # If the file name already exists, save as title[0].text_content()[:8] + str(x) + '.docx'; otherwise save as filename
        if filename in os.listdir(r'F:\Python\worm\Blog Garden files'):
            doc.save(title[0].text_content()[:8] + str(x) + '.docx')
            x += 1
        else:
            doc.save(filename)

# Take an html object and return all links as a list
def get_links(html):
    # Extract all page links from the html with a regular expression
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    html = html.decode('utf-8')
    # Return all links as a list
    return webpage_regex.findall(html)

link_crawler('https://www.cnblogs.com/cate/python/', r'.*/www.cnblogs.com/.*?\.html$')

(3) Results

 

(4) Existing problems

(1) The code blocks are appended after the body text, so if the post used Blog Garden's code-insertion feature, the layout will not match the original.

(2) The images are inserted after the code section, so if the post contains images, the layout will not match the original.
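Both problems come from collecting paragraphs, code blocks and images in three separate passes. One possible fix, sketched below under the assumption that the post body keeps the same structure as above (it reuses tree, doc and ure from the listing), is to walk the p, pre and img elements in document order and handle each one as it appears:

body = tree.xpath('//div[@id="cnblogs_post_body"]')[0]
# XPath unions are returned in document order, so the original layout is preserved
for element in body.xpath('.//p | .//pre | .//img'):
    if element.tag == 'img':
        # download the image and insert it where it appears in the post
        ure.urlretrieve(element.get('src'), '0.jpg')
        doc.add_picture('0.jpg')
    else:
        # p or pre: add its text as a paragraph
        doc.add_paragraph(element.text_content())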
