mirror of https://github.com/langgenius/dify.git
fix: correct docx hyperlink extraction (#30360)
This commit is contained in:
parent f320fd5f95
commit 114a34e008
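In a .docx file a hyperlink can be stored in two ways: as a w:hyperlink element whose r:id points at an external relationship, or as a legacy field code (a w:fldChar "begin", a w:instrText carrying HYPERLINK "url", a "separate" marker, the visible text runs, and a closing "end"). The rewrite below handles both forms and renders each as a markdown link. Roughly what the change does end to end, as exercised by the tests added at the bottom of this diff (the document path here is a placeholder):

from core.rag.extractor.word_extractor import WordExtractor

# Hypothetical document path; tenant/user ids as used in the tests below.
extractor = WordExtractor("/tmp/example.docx", "tenant_id", "user_id")
docs = extractor.extract()

# A paragraph "Visit " followed by a hyperlink run "Dify" -> https://dify.ai
# now surfaces as: "Visit[Dify](https://dify.ai)"
print(docs[0].page_content)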
@@ -7,10 +7,11 @@ import re
 import tempfile
 import uuid
 from urllib.parse import urlparse
-from xml.etree import ElementTree

 import httpx
 from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.run import Run

 from configs import dify_config
 from core.helper import ssrf_proxy
@@ -229,44 +230,20 @@ class WordExtractor(BaseExtractor):
         image_map = self._extract_images_from_docx(doc)

-        hyperlinks_url = None
-        url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
-        for para in doc.paragraphs:
-            for run in para.runs:
-                if run.text and hyperlinks_url:
-                    result = f" [{run.text}]({hyperlinks_url}) "
-                    run.text = result
-                    hyperlinks_url = None
-                if "HYPERLINK" in run.element.xml:
-                    try:
-                        xml = ElementTree.XML(run.element.xml)
-                        x_child = [c for c in xml.iter() if c is not None]
-                        for x in x_child:
-                            if x is None:
-                                continue
-                            if x.tag.endswith("instrText"):
-                                if x.text is None:
-                                    continue
-                                for i in url_pattern.findall(x.text):
-                                    hyperlinks_url = str(i)
-                    except Exception:
-                        logger.exception("Failed to parse HYPERLINK xml")
-
         def parse_paragraph(paragraph):
-            paragraph_content = []
-
-            def append_image_link(image_id, has_drawing):
+            def append_image_link(image_id, has_drawing, target_buffer):
                 """Helper to append image link from image_map based on relationship type."""
                 rel = doc.part.rels[image_id]
                 if rel.is_external:
                     if image_id in image_map and not has_drawing:
-                        paragraph_content.append(image_map[image_id])
+                        target_buffer.append(image_map[image_id])
                 else:
                     image_part = rel.target_part
                     if image_part in image_map and not has_drawing:
-                        paragraph_content.append(image_map[image_part])
+                        target_buffer.append(image_map[image_part])

-            for run in paragraph.runs:
+            def process_run(run, target_buffer):
+                # Helper to extract text and embedded images from a run element and append them to target_buffer
                 if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
                     # Process drawing type images
                     drawing_elements = run.element.findall(
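For context on the block removed above: the old pattern's first alternative only matches http URLs that end with "//", so ordinary http links were silently dropped, and the match for https links runs past the closing quote of the field code. A quick illustration using only the standard library (the sample strings are hypothetical field instructions):

import re

url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")

# Plain http URLs never match: the first alternative requires a trailing "//".
print(url_pattern.findall('HYPERLINK "http://example.com"'))  # []

# https URLs match, but the closing quote of the field code is swallowed.
print(url_pattern.findall('HYPERLINK "https://dify.ai"'))  # ['https://dify.ai"']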
@@ -287,13 +264,13 @@ class WordExtractor(BaseExtractor):
                             # External image: use embed_id as key
                             if embed_id in image_map:
                                 has_drawing = True
-                                paragraph_content.append(image_map[embed_id])
+                                target_buffer.append(image_map[embed_id])
                             else:
                                 # Internal image: use target_part as key
                                 image_part = doc.part.related_parts.get(embed_id)
                                 if image_part in image_map:
                                     has_drawing = True
-                                    paragraph_content.append(image_map[image_part])
+                                    target_buffer.append(image_map[image_part])
                 # Process pict type images
                 shape_elements = run.element.findall(
                     ".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
@@ -308,7 +285,7 @@
                         "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                     )
                     if image_id and image_id in doc.part.rels:
-                        append_image_link(image_id, has_drawing)
+                        append_image_link(image_id, has_drawing, target_buffer)
                     # Find imagedata element in VML
                     image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
                     if image_data is not None:
@@ -316,9 +293,93 @@
                             "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
                         )
                         if image_id and image_id in doc.part.rels:
-                            append_image_link(image_id, has_drawing)
+                            append_image_link(image_id, has_drawing, target_buffer)
                 if run.text.strip():
-                    paragraph_content.append(run.text.strip())
+                    target_buffer.append(run.text.strip())
+
+            def process_hyperlink(hyperlink_elem, target_buffer):
+                # Helper to extract text from a hyperlink element and append it to target_buffer
+                r_id = hyperlink_elem.get(qn("r:id"))
+
+                # Extract text from runs inside the hyperlink
+                link_text_parts = []
+                for run_elem in hyperlink_elem.findall(qn("w:r")):
+                    run = Run(run_elem, paragraph)
+                    # Hyperlink text may be split across multiple runs (e.g., with different formatting),
+                    # so collect all run texts first
+                    if run.text:
+                        link_text_parts.append(run.text)
+
+                link_text = "".join(link_text_parts).strip()
+
+                # Resolve URL
+                if r_id:
+                    try:
+                        rel = doc.part.rels.get(r_id)
+                        if rel and rel.is_external:
+                            link_text = f"[{link_text or rel.target_ref}]({rel.target_ref})"
+                    except Exception:
+                        logger.exception("Failed to resolve URL for hyperlink with r:id: %s", r_id)
+
+                if link_text:
+                    target_buffer.append(link_text)
+
+            paragraph_content = []
+            # State for legacy HYPERLINK fields
+            hyperlink_field_url = None
+            hyperlink_field_text_parts: list = []
+            is_collecting_field_text = False
+            # Iterate through paragraph elements in document order
+            for child in paragraph._element:
+                tag = child.tag
+                if tag == qn("w:r"):
+                    # Regular run
+                    run = Run(child, paragraph)
+
+                    # Check for fldChar (begin/end/separate) and instrText for legacy hyperlinks
+                    fld_chars = child.findall(qn("w:fldChar"))
+                    instr_texts = child.findall(qn("w:instrText"))
+
+                    # Handle Fields
+                    if fld_chars or instr_texts:
+                        # Process instrText to find HYPERLINK "url"
+                        for instr in instr_texts:
+                            if instr.text and "HYPERLINK" in instr.text:
+                                # Quick regex to extract URL
+                                match = re.search(r'HYPERLINK\s+"([^"]+)"', instr.text, re.IGNORECASE)
+                                if match:
+                                    hyperlink_field_url = match.group(1)
+
+                        # Process fldChar
+                        for fld_char in fld_chars:
+                            fld_char_type = fld_char.get(qn("w:fldCharType"))
+                            if fld_char_type == "begin":
+                                # Start of a field: reset legacy link state
+                                hyperlink_field_url = None
+                                hyperlink_field_text_parts = []
+                                is_collecting_field_text = False
+                            elif fld_char_type == "separate":
+                                # Separator: if we found a URL, start collecting visible text
+                                if hyperlink_field_url:
+                                    is_collecting_field_text = True
+                            elif fld_char_type == "end":
+                                # End of field
+                                if is_collecting_field_text and hyperlink_field_url:
+                                    # Create markdown link and append to main content
+                                    display_text = "".join(hyperlink_field_text_parts).strip()
+                                    if display_text:
+                                        link_md = f"[{display_text}]({hyperlink_field_url})"
+                                        paragraph_content.append(link_md)
+                                    # Reset state
+                                    hyperlink_field_url = None
+                                    hyperlink_field_text_parts = []
+                                    is_collecting_field_text = False
+
+                    # Decide where to append content
+                    target_buffer = hyperlink_field_text_parts if is_collecting_field_text else paragraph_content
+                    process_run(run, target_buffer)
+                elif tag == qn("w:hyperlink"):
+                    process_hyperlink(child, paragraph_content)
             return "".join(paragraph_content) if paragraph_content else ""

         paragraphs = doc.paragraphs.copy()
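The legacy-field branch above recovers the URL from the w:instrText content rather than from a relationship. As a small sanity check of that regex, here it is applied to the same instruction string the legacy-hyperlink test below constructs:

import re

# <w:instrText> payload of a legacy HYPERLINK field (as built in test_extract_legacy_hyperlinks).
instr_text = ' HYPERLINK "http://example.com" '

match = re.search(r'HYPERLINK\s+"([^"]+)"', instr_text, re.IGNORECASE)
print(match.group(1))  # http://example.com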
@@ -1,8 +1,12 @@
 """Primarily used for testing merged cell scenarios"""

+import os
+import tempfile
 from types import SimpleNamespace

 from docx import Document
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn

 import core.rag.extractor.word_extractor as we
 from core.rag.extractor.word_extractor import WordExtractor
@@ -165,3 +169,110 @@ def test_extract_images_from_docx_uses_internal_files_url():
     dify_config.FILES_URL = original_files_url
     if original_internal_files_url is not None:
         dify_config.INTERNAL_FILES_URL = original_internal_files_url
+
+
+def test_extract_hyperlinks(monkeypatch):
+    # Mock db and storage to avoid issues during image extraction (even if no images are present)
+    monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
+    db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None))
+    monkeypatch.setattr(we, "db", db_stub)
+    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
+
+    doc = Document()
+    p = doc.add_paragraph("Visit ")
+
+    # Adding a hyperlink manually
+    r_id = "rId99"
+    hyperlink = OxmlElement("w:hyperlink")
+    hyperlink.set(qn("r:id"), r_id)
+
+    new_run = OxmlElement("w:r")
+    t = OxmlElement("w:t")
+    t.text = "Dify"
+    new_run.append(t)
+    hyperlink.append(new_run)
+    p._p.append(hyperlink)
+
+    # Add relationship to the part
+    doc.part.rels.add_relationship(
+        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
+        "https://dify.ai",
+        r_id,
+        is_external=True,
+    )
+
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
+        doc.save(tmp.name)
+        tmp_path = tmp.name
+
+    try:
+        extractor = WordExtractor(tmp_path, "tenant_id", "user_id")
+        docs = extractor.extract()
+        # Verify modern hyperlink extraction
+        assert "Visit[Dify](https://dify.ai)" in docs[0].page_content
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def test_extract_legacy_hyperlinks(monkeypatch):
+    # Mock db and storage
+    monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
+    db_stub = SimpleNamespace(session=SimpleNamespace(add=lambda o: None, commit=lambda: None))
+    monkeypatch.setattr(we, "db", db_stub)
+    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
+
+    doc = Document()
+    p = doc.add_paragraph()
+
+    # Construct a legacy HYPERLINK field:
+    # 1. w:fldChar (begin)
+    # 2. w:instrText (HYPERLINK "http://example.com")
+    # 3. w:fldChar (separate)
+    # 4. w:r (visible text "Example")
+    # 5. w:fldChar (end)
+
+    run1 = OxmlElement("w:r")
+    fldCharBegin = OxmlElement("w:fldChar")
+    fldCharBegin.set(qn("w:fldCharType"), "begin")
+    run1.append(fldCharBegin)
+    p._p.append(run1)
+
+    run2 = OxmlElement("w:r")
+    instrText = OxmlElement("w:instrText")
+    instrText.text = ' HYPERLINK "http://example.com" '
+    run2.append(instrText)
+    p._p.append(run2)
+
+    run3 = OxmlElement("w:r")
+    fldCharSep = OxmlElement("w:fldChar")
+    fldCharSep.set(qn("w:fldCharType"), "separate")
+    run3.append(fldCharSep)
+    p._p.append(run3)
+
+    run4 = OxmlElement("w:r")
+    t4 = OxmlElement("w:t")
+    t4.text = "Example"
+    run4.append(t4)
+    p._p.append(run4)
+
+    run5 = OxmlElement("w:r")
+    fldCharEnd = OxmlElement("w:fldChar")
+    fldCharEnd.set(qn("w:fldCharType"), "end")
+    run5.append(fldCharEnd)
+    p._p.append(run5)
+
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
+        doc.save(tmp.name)
+        tmp_path = tmp.name
+
+    try:
+        extractor = WordExtractor(tmp_path, "tenant_id", "user_id")
+        docs = extractor.extract()
+        # Verify legacy hyperlink extraction
+        assert "[Example](http://example.com)" in docs[0].page_content
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)