Working with Word files (*.docx) in Python using python-docx

Keywords: Python Google xml github

Catalog

Basic operation

from docx import Document
from docx.shared import Inches

# Create an empty document
document = Document()

# Add the document title. The level must satisfy 0 <= level <= 9:
# 0 gives the Title style, 1 (or omitting the level) gives Heading 1.
document.add_heading('Document Title', 0)
# Add a paragraph; add_paragraph() defaults to text='' and style=None
p = document.add_paragraph('A plain paragraph having some ')
# Append runs to the paragraph; add_run() defaults to text=None and style=None.
# A run object has `bold` and `italic` attributes for character formatting.
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)
document.add_paragraph('Intense quote', style='Intense Quote')

document.add_paragraph(
    'first item in unordered list', style='List Bullet'
)
document.add_paragraph(
    'first item in ordered list', style='List Number'
)
# Add a picture
document.add_picture('monty-truth.png', width=Inches(1.25))

# Add a table: one header row, then one row per record
records = (
    (3, '101', 'Spam'),
    (7, '422', 'Eggs'),
    (4, '631', 'Spam, spam, eggs, and spam'),
)

table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Qty'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# The loop variable is `item_id` rather than `id` so that the built-in id()
# is not shadowed at module level for the rest of the script.
for qty, item_id, desc in records:
    row_cells = table.add_row().cells
    # qty is an int in `records`, and cell text is assigned as a string
    row_cells[0].text = str(qty)
    row_cells[1].text = item_id
    row_cells[2].text = desc

document.add_page_break()

Object relationship

After calling document.add_paragraph(), the text of the new paragraph is held in its first run by default.

Add styles

Use Microsoft YaHei as the Chinese font and Times New Roman as the Western font.

import docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Cm, Pt

document = Document()
# Look up the 'Normal' style so the font settings below are applied to it
style = document.styles['Normal']
# Set the Western font
style.font.name = 'Times New Roman'
# Set the Chinese font via the w:eastAsia attribute of the style's rFonts element.
# NOTE(review): presumably this must stay after the font.name assignment above,
# which is what creates the rPr/rFonts elements accessed here -- confirm.
style.element.rPr.rFonts.set(qn('w:eastAsia'), 'Microsoft YaHei')

First-line indent

# Get the paragraph formatting attached to the style
paragraph_format = style.paragraph_format
# Indent the first line of each paragraph by 0.74 cm (about 2 characters,
# according to the original author)
paragraph_format.first_line_indent = Cm(0.74)

Style the title separately

# Add an empty level-0 heading that will hold the title
title_ = document.add_heading(level=0)
# Center the title
title_.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Add the title text as a run.
# NOTE(review): `title` is not defined in this snippet; presumably the caller
# assigns the title string beforehand -- confirm.
title_run = title_.add_run(title)
# Set the title font size to 14 pt
title_run.font.size = Pt(14)
# Set the title's Western font
title_run.font.name = 'Times New Roman'
# Set the title's Chinese font via the w:eastAsia attribute of the run's rFonts.
# NOTE(review): presumably relies on the font.name assignment above having
# created the rPr/rFonts elements -- confirm before reordering.
title_run.element.rPr.rFonts.set(qn('w:eastAsia'), 'Microsoft YaHei')

Set hyperlink

def add_hyperlink(paragraph, url, text, color=None, underline=True):
    """
    Append an external hyperlink to the end of a paragraph.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the required url
    :param text: The text displayed for the url
    :param color: Value written to the ``w:val`` attribute of a ``w:color``
        element (for example ``'FF8822'``). ``None`` adds no colour element.
    :param underline: If falsy, a ``w:u`` element with ``w:val="none"`` is
        added to the run properties. If truthy, no underline element is added.
    :return: The ``w:hyperlink`` element that was appended to the paragraph
    """

    # Register the url as an external relationship of the paragraph's part and
    # get the relationship id that the w:hyperlink element refers to
    part = paragraph.part
    r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag and point it at the relationship
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)

    # Create the w:r (run) element and its w:rPr (run properties) element
    new_run = docx.oxml.shared.OxmlElement('w:r')
    rPr = docx.oxml.shared.OxmlElement('w:rPr')

    # Add color if it is given
    if color is not None:
        c = docx.oxml.shared.OxmlElement('w:color')
        c.set(docx.oxml.shared.qn('w:val'), color)
        rPr.append(c)

    # Remove underlining if it is requested
    if not underline:
        u = docx.oxml.shared.OxmlElement('w:u')
        u.set(docx.oxml.shared.qn('w:val'), 'none')
        rPr.append(u)

    # Join all the xml elements together and add the required text to the w:r element
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    # Append directly to the paragraph's underlying XML element
    paragraph._p.append(hyperlink)

    return hyperlink

# Start from a new document containing a single empty paragraph
document = docx.Document()
p = document.add_paragraph()

# First link: no colour override, underline left untouched
hyperlink = add_hyperlink(
    paragraph=p,
    url='http://www.google.com',
    text='Google',
    color=None,
    underline=True,
)

# Second link: custom colour, underline explicitly removed
hyperlink = add_hyperlink(
    paragraph=p,
    url='http://www.google.com',
    text='Google',
    color='FF8822',
    underline=False,
)

# Write the result to disk
document.save('demo.docx')

The function above turns its entire text argument into a link. In everyday use, a hyperlink usually covers only a keyword inside a sentence, often marked up with an <a> tag. Because a paragraph is made up of runs, this can be handled by splitting the text, adding the plain parts as ordinary runs and the linked parts as hyperlinks.

For example, given the following text, replace the <a> tag with a hyperlink:

"""I am trying to add an hyperlink in a MS Word document using docx module for <a href="python.org">Python</a>. Just do it."""

def is_text_link(text):
    """
    Determine whether a piece of text looks like a link.

    :param text: The string to inspect
    :return: True if ``text`` contains any of the known URL markers
        (scheme fragments or common domain/file suffixes), otherwise False
    """
    markers = ('http', '://', 'www.', '.com', '.org', '.cn', '.xyz', '.htm')
    # Every marker must be checked before answering False; returning False
    # from inside the loop would stop after testing only the first marker.
    return any(marker in text for marker in markers)

def add_text_link(document, text):
    """
    Add a paragraph to the document, turning <a> tags in the text into hyperlinks.

    Every ``<a href="URL">LABEL</a>`` occurrence in ``text`` becomes a
    hyperlink showing LABEL and pointing at URL. All text outside the tags is
    added as ordinary runs, in its original order.

    :param document: The document the new paragraph is added to
    :param text: The paragraph text, optionally containing <a href="..."> tags
    """
    # Imported locally so the function does not depend on a file-level
    # `import re`, which this file does not provide.
    import re

    paragraph = document.add_paragraph()
    # With two capturing groups, re.split() returns
    # [plain, url, label, plain, url, label, ..., plain].
    # Locating the url and label by position avoids guessing which pieces
    # are links, so no text is dropped or mislabelled.
    segments = re.split(r'<a href="([^"]*)">(.*?)</a>', text, flags=re.DOTALL)
    for start in range(0, len(segments), 3):
        plain = segments[start]
        # Skip empty pieces (e.g. text that starts or ends with a tag)
        if plain:
            paragraph.add_run(plain)
        # A (url, label) pair follows every plain piece except the last one
        if start + 2 < len(segments):
            url = segments[start + 1]
            keyword = segments[start + 2]
            add_hyperlink(paragraph, url, keyword, None, True)

References

  1. https://python-docx.readthedocs.io/en/latest/index.html
  2. https://github.com/python-openxml/python-docx/issues/74
  3. http://www.warmeng.com/2018/12/02/auto_report/

Posted by GarroteYou on Fri, 22 Nov 2019 11:23:17 -0800