fix: correct docx hyperlink extraction (#30360)

Zhiqiang Yang 2026-01-06 03:24:26 +00:00 committed by GitHub
parent f320fd5f95
commit 114a34e008
2 changed files with 207 additions and 35 deletions
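
Background for the diff below: .docx files encode hyperlinks in two different ways, and the old extractor scanned run XML only for the legacy form, mishandling it and missing modern links entirely. A sketch of the two shapes with made-up content (the element names are standard OOXML; nothing here is taken verbatim from the patch):

    # Illustrative only: the two OOXML hyperlink encodings this commit handles.
    #
    # Modern form - the URL lives in the document part's relationships, keyed by r:id:
    #
    #   <w:hyperlink r:id="rId99"><w:r><w:t>Dify</w:t></w:r></w:hyperlink>
    #
    # Legacy form - a Word HYPERLINK field spelled out across several runs:
    #
    #   <w:r><w:fldChar w:fldCharType="begin"/></w:r>
    #   <w:r><w:instrText> HYPERLINK "http://example.com" </w:instrText></w:r>
    #   <w:r><w:fldChar w:fldCharType="separate"/></w:r>
    #   <w:r><w:t>Example</w:t></w:r>
    #   <w:r><w:fldChar w:fldCharType="end"/></w:r>
    #
    # Either way, the extractor should emit a markdown link: [display text](url).
    import re

    instr_text = ' HYPERLINK "http://example.com" '
    match = re.search(r'HYPERLINK\s+"([^"]+)"', instr_text, re.IGNORECASE)
    assert match and match.group(1) == "http://example.com"  # same regex the patch uses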

api/core/rag/extractor/word_extractor.py

@@ -7,10 +7,11 @@ import re
 import tempfile
 import uuid
 from urllib.parse import urlparse
-from xml.etree import ElementTree
 
 import httpx
 from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.run import Run
 
 from configs import dify_config
 from core.helper import ssrf_proxy
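
The two new imports carry most of the fix: qn() expands a prefixed OOXML tag name into the Clark-notation string that lxml element tags and findall() actually use, and Run wraps a raw <w:r> element so its text is readable through the normal python-docx API. For example:

    from docx.oxml.ns import qn

    # qn("prefix:local") -> "{namespace-uri}local", the form used by element.tag
    assert qn("w:r") == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
    assert qn("r:id") == "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"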
@@ -229,44 +230,20 @@ class WordExtractor(BaseExtractor):
         image_map = self._extract_images_from_docx(doc)
 
-        hyperlinks_url = None
-        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
-
-        for para in doc.paragraphs:
-            for run in para.runs:
-                if run.text and hyperlinks_url:
-                    result = f" [{run.text}]({hyperlinks_url}) "
-                    run.text = result
-                    hyperlinks_url = None
-                if "HYPERLINK" in run.element.xml:
-                    try:
-                        xml = ElementTree.XML(run.element.xml)
-                        x_child = [c for c in xml.iter() if c is not None]
-                        for x in x_child:
-                            if x is None:
-                                continue
-                            if x.tag.endswith("instrText"):
-                                if x.text is None:
-                                    continue
-                                for i in url_pattern.findall(x.text):
-                                    hyperlinks_url = str(i)
-                    except Exception:
-                        logger.exception("Failed to parse HYPERLINK xml")
-
         def parse_paragraph(paragraph):
-            paragraph_content = []
 
-            def append_image_link(image_id, has_drawing):
+            def append_image_link(image_id, has_drawing, target_buffer):
                 """Helper to append image link from image_map based on relationship type."""
                 rel = doc.part.rels[image_id]
                 if rel.is_external:
                     if image_id in image_map and not has_drawing:
-                        paragraph_content.append(image_map[image_id])
+                        target_buffer.append(image_map[image_id])
                 else:
                     image_part = rel.target_part
                     if image_part in image_map and not has_drawing:
-                        paragraph_content.append(image_map[image_part])
+                        target_buffer.append(image_map[image_part])
 
-            for run in paragraph.runs:
+            def process_run(run, target_buffer):
+                # Helper to extract text and embedded images from a run element and append them to target_buffer
                 if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
                     # Process drawing type images
                     drawing_elements = run.element.findall(
@@ -287,13 +264,13 @@ class WordExtractor(BaseExtractor):
                             # External image: use embed_id as key
                             if embed_id in image_map:
                                 has_drawing = True
-                                paragraph_content.append(image_map[embed_id])
+                                target_buffer.append(image_map[embed_id])
                         else:
                             # Internal image: use target_part as key
                             image_part = doc.part.related_parts.get(embed_id)
                             if image_part in image_map:
                                 has_drawing = True
-                                paragraph_content.append(image_map[image_part])
+                                target_buffer.append(image_map[image_part])
                 # Process pict type images
                 shape_elements = run.element.findall(
                     ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
@@ -308,7 +285,7 @@ class WordExtractor(BaseExtractor):
                         "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                     )
                     if image_id and image_id in doc.part.rels:
-                        append_image_link(image_id, has_drawing)
+                        append_image_link(image_id, has_drawing, target_buffer)
                     # Find imagedata element in VML
                     image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
                     if image_data is not None:
@@ -316,9 +293,93 @@ class WordExtractor(BaseExtractor):
                             "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                         )
                         if image_id and image_id in doc.part.rels:
-                            append_image_link(image_id, has_drawing)
+                            append_image_link(image_id, has_drawing, target_buffer)
                 if run.text.strip():
-                    paragraph_content.append(run.text.strip())
+                    target_buffer.append(run.text.strip())
+
+            def process_hyperlink(hyperlink_elem, target_buffer):
+                # Helper to extract text from a hyperlink element and append it to target_buffer
+                r_id = hyperlink_elem.get(qn("r:id"))
+
+                # Extract text from runs inside the hyperlink
+                link_text_parts = []
+                for run_elem in hyperlink_elem.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    # Hyperlink text may be split across multiple runs (e.g., with different formatting),
+                    # so collect all run texts first
+                    if run.text:
+                        link_text_parts.append(run.text)
+                link_text = "".join(link_text_parts).strip()
+
+                # Resolve URL
+                if r_id:
+                    try:
+                        rel = doc.part.rels.get(r_id)
+                        if rel and rel.is_external:
+                            link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+
+                if link_text:
+                    target_buffer.append(link_text)
+
+            paragraph_content = []
+
+            # State for legacy HYPERLINK fields
+            hyperlink_field_url = None
+            hyperlink_field_text_parts: list = []
+            is_collecting_field_text = False
+
+            # Iterate through paragraph elements in document order
+            for child in paragraph._element:
+                tag = child.tag
+                if tag == qn("w:r"):
+                    # Regular run
+                    run = Run(child, paragraph)
+                    # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks
+                    fld_chars = child.findall(qn("w:fldChar"))
+                    instr_texts = child.findall(qn("w:instrText"))
+
+                    # Handle fields
+                    if fld_chars or instr_texts:
+                        # Process instrText to find HYPERLINK "url"
+                        for instr in instr_texts:
+                            if instr.text and "HYPERLINK" in instr.text:
+                                # Quick regex to extract URL
+                                match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE)
+                                if match:
+                                    hyperlink_field_url = match.group(1)
+
+                        # Process fldChar
+                        for fld_char in fld_chars:
+                            fld_char_type = fld_char.get(qn("w:fldCharType"))
+                            if fld_char_type == "begin":
+                                # Start of a field: reset legacy link state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+                            elif fld_char_type == "separate":
+                                # Separator: if we found a URL, start collecting visible text
+                                if hyperlink_field_url:
+                                    is_collecting_field_text = True
+                            elif fld_char_type == "end":
+                                # End of field
+                                if is_collecting_field_text and hyperlink_field_url:
+                                    # Create markdown link and append to main content
+                                    display_text = "".join(hyperlink_field_text_parts).strip()
+                                    if display_text:
+                                        link_md = f"[{display_text}]({hyperlink_field_url})"
+                                        paragraph_content.append(link_md)
+                                # Reset state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+
+                    # Decide where to append content
+                    target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content
+                    process_run(run, target_buffer)
+                elif tag == qn("w:hyperlink"):
+                    process_hyperlink(child, paragraph_content)
+
             return "".join(paragraph_content) if paragraph_content else ""
 
         paragraphs = doc.paragraphs.copy()
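
Stripped of the docx plumbing, the legacy-field handling added above is a small state machine driven by w:fldCharType. A standalone sketch of the same control flow, using hypothetical (kind, payload) event tuples rather than real run elements (not code from the patch):

    import re

    def fold_field_events(events):
        # Mirrors parse_paragraph's begin/separate/end handling: text seen between
        # "separate" and "end" becomes the display text of a markdown link.
        out, url, parts, collecting = [], None, [], False
        for kind, payload in events:
            if kind == "begin":        # field starts: reset state
                url, parts, collecting = None, [], False
            elif kind == "instr":      # field code: try to pull the URL
                m = re.search(r'HYPERLINK\s+"([^"]+)"', payload, re.IGNORECASE)
                if m:
                    url = m.group(1)
            elif kind == "separate":   # switch from field code to visible text
                collecting = bool(url)
            elif kind == "end":        # field ends: flush collected text
                text = "".join(parts).strip()
                if collecting and url and text:
                    out.append(f"[{text}]({url})")
                url, parts, collecting = None, [], False
            elif kind == "text":       # plain run text
                (parts if collecting else out).append(payload)
        return "".join(out)

    print(fold_field_events([
        ("begin", None),
        ("instr", ' HYPERLINK "http://example.com" '),
        ("separate", None),
        ("text", "Example"),
        ("end", None),
    ]))  # -> [Example](http://example.com)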

api/tests/unit_tests/core/rag/extractor/test_word_extractor.py

@@ -1,8 +1,12 @@
 """Primarily used for testing merged cell scenarios"""
 
+import os
 import tempfile
+from types import SimpleNamespace
 
 from docx import Document
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
 
 import core.rag.extractor.word_extractor as we
 from core.rag.extractor.word_extractor import WordExtractor
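
A note on the test setup that follows: word_extractor touches module-level storage and db singletons during extraction, so the tests swap them for SimpleNamespace stubs; an attribute bag works anywhere only attribute access is needed. Roughly:

    from types import SimpleNamespace

    # Enough to stand in for we.storage (needs .save) and we.db
    # (needs .session.add and .session.commit).
    storage_stub = SimpleNamespace(save=lambda key, data: None)
    db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda obj: None, commit=lambda: None))

    storage_stub.save("key", b"data")  # no-op
    db_stub.session.add(object())
    db_stub.session.commit()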
@@ -165,3 +169,110 @@ def test_extract_images_from_docx_uses_internal_files_url():
         dify_config.FILES_URL = original_files_url
         if original_internal_files_url is not None:
             dify_config.INTERNAL_FILES_URL = original_internal_files_url
+
+
+def test_extract_hyperlinks(monkeypatch):
+    # Mock db and storage to avoid issues during image extraction (even if no images are present)
+    monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
+    db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None))
+    monkeypatch.setattr(we, "db", db_stub)
+    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
+
+    doc = Document()
+    p = doc.add_paragraph("Visit ")
+
+    # Add a modern w:hyperlink element manually
+    r_id = "rId99"
+    hyperlink = OxmlElement("w:hyperlink")
+    hyperlink.set(qn("r:id"), r_id)
+    new_run = OxmlElement("w:r")
+    t = OxmlElement("w:t")
+    t.text = "Dify"
+    new_run.append(t)
+    hyperlink.append(new_run)
+    p._p.append(hyperlink)
+
+    # Register the external relationship on the document part
+    doc.part.rels.add_relationship(
+        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
+        "https://dify.ai",
+        r_id,
+        is_external=True,
+    )
+
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
+        doc.save(tmp.name)
+        tmp_path = tmp.name
+
+    try:
+        extractor = WordExtractor(tmp_path, "tenant_id", "user_id")
+        docs = extractor.extract()
+        # Verify modern hyperlink extraction; run text is stripped, so there is
+        # no space between "Visit" and the link
+        assert "Visit[Dify](https://dify.ai)" in docs[0].page_content
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def test_extract_legacy_hyperlinks(monkeypatch):
+    # Mock db and storage
+    monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
+    db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None))
+    monkeypatch.setattr(we, "db", db_stub)
+    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
+
+    doc = Document()
+    p = doc.add_paragraph()
+
+    # Construct a legacy HYPERLINK field:
+    # 1. w:fldChar (begin)
+    # 2. w:instrText (HYPERLINK "http://example.com")
+    # 3. w:fldChar (separate)
+    # 4. w:r (visible text "Example")
+    # 5. w:fldChar (end)
+    run1 = OxmlElement("w:r")
+    fldCharBegin = OxmlElement("w:fldChar")
+    fldCharBegin.set(qn("w:fldCharType"), "begin")
+    run1.append(fldCharBegin)
+    p._p.append(run1)
+
+    run2 = OxmlElement("w:r")
+    instrText = OxmlElement("w:instrText")
+    instrText.text = ' HYPERLINK "http://example.com" '
+    run2.append(instrText)
+    p._p.append(run2)
+
+    run3 = OxmlElement("w:r")
+    fldCharSep = OxmlElement("w:fldChar")
+    fldCharSep.set(qn("w:fldCharType"), "separate")
+    run3.append(fldCharSep)
+    p._p.append(run3)
+
+    run4 = OxmlElement("w:r")
+    t4 = OxmlElement("w:t")
+    t4.text = "Example"
+    run4.append(t4)
+    p._p.append(run4)
+
+    run5 = OxmlElement("w:r")
+    fldCharEnd = OxmlElement("w:fldChar")
+    fldCharEnd.set(qn("w:fldCharType"), "end")
+    run5.append(fldCharEnd)
+    p._p.append(run5)
+
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
+        doc.save(tmp.name)
+        tmp_path = tmp.name
+
+    try:
+        extractor = WordExtractor(tmp_path, "tenant_id", "user_id")
+        docs = extractor.extract()
+        # Verify legacy hyperlink extraction
+        assert "[Example](http://example.com)" in docs[0].page_content
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
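
Not covered by this commit's tests, but implied by the new code: a w:hyperlink that targets an internal bookmark carries w:anchor instead of r:id, so there is no relationship to resolve and process_hyperlink falls back to plain run text. A hypothetical extra test in the same style (reusing the module's imports; not part of the patch):

    def test_extract_anchor_hyperlink(monkeypatch):
        # Hypothetical: internal bookmark links have no r:id, so the visible text
        # should pass through without being wrapped in a markdown link.
        monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
        monkeypatch.setattr(we, "db", SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None)))
        monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
        monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)

        doc = Document()
        p = doc.add_paragraph()
        hyperlink = OxmlElement("w:hyperlink")
        hyperlink.set(qn("w:anchor"), "section1")  # internal target, no relationship
        run = OxmlElement("w:r")
        t = OxmlElement("w:t")
        t.text = "See section 1"
        run.append(t)
        hyperlink.append(run)
        p._p.append(hyperlink)

        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            doc.save(tmp.name)
            tmp_path = tmp.name

        try:
            docs = WordExtractor(tmp_path, "tenant_id", "user_id").extract()
            assert "See section 1" in docs[0].page_content
            assert "](" not in docs[0].page_content  # no markdown link emitted
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)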