mirror of https://github.com/langgenius/dify.git
Merge 891d8a5d99 into 49a1fae555
This commit is contained in:
commit
a42bb327f5
|
|
@ -53,7 +53,10 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|||
def __init__(self, fixed_separator: str = "\n\n", separators: list[str] | None = None, **kwargs: Any):
|
||||
"""Create a new TextSplitter."""
|
||||
super().__init__(**kwargs)
|
||||
self._fixed_separator = codecs.decode(fixed_separator, "unicode_escape")
|
||||
if "\\" in fixed_separator:
|
||||
self._fixed_separator = codecs.decode(fixed_separator, "unicode_escape")
|
||||
else:
|
||||
self._fixed_separator = fixed_separator
|
||||
self._separators = separators or ["\n\n", "\n", "。", ". ", " ", ""]
|
||||
|
||||
def split_text(self, text: str) -> list[str]:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,140 @@
|
|||
"""
|
||||
Test for invisible Unicode delimiter support in FixedRecursiveCharacterTextSplitter.
|
||||
|
||||
Regression test for issue #31672:
|
||||
ZWSP and other invisible Unicode characters should work as delimiters.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
class TestInvisibleDelimiters:
|
||||
"""Test invisible Unicode characters as delimiters."""
|
||||
|
||||
@pytest.fixture
|
||||
def base_splitter_kwargs(self):
|
||||
"""Common kwargs for creating splitters."""
|
||||
|
||||
def length_function(texts: list[str]) -> list[int]:
|
||||
return [len(text) for text in texts]
|
||||
|
||||
return {
|
||||
"chunk_size": 100,
|
||||
"chunk_overlap": 0,
|
||||
"length_function": length_function,
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"delimiter",
|
||||
[
|
||||
pytest.param("\u200b", id="zwsp"),
|
||||
pytest.param("\ufeff", id="zwnbsp"),
|
||||
pytest.param("\u2063", id="invisible_separator"),
|
||||
pytest.param("\u2060", id="word_joiner"),
|
||||
pytest.param("\u200e", id="ltr_mark"),
|
||||
],
|
||||
)
|
||||
def test_invisible_literal_delimiters(self, base_splitter_kwargs, delimiter):
|
||||
"""Test that various invisible Unicode characters work as literal delimiters."""
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator=delimiter,
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = f"chunk1{delimiter}chunk2{delimiter}chunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert result == ["chunk1", "chunk2", "chunk3"]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("escaped_char", "literal_char"),
|
||||
[
|
||||
pytest.param("\\n", "\n", id="newline"),
|
||||
pytest.param("\\t", "\t", id="tab"),
|
||||
],
|
||||
)
|
||||
def test_escaped_chars_still_work(self, base_splitter_kwargs, escaped_char, literal_char):
|
||||
"""Escaped characters should still be decoded properly."""
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator=escaped_char,
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = f"chunk1{literal_char}chunk2{literal_char}chunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert result == ["chunk1", "chunk2", "chunk3"]
|
||||
|
||||
def test_literal_newline_works(self, base_splitter_kwargs):
|
||||
"""Literal newline should work without escaping."""
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator="\n",
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = "chunk1\nchunk2\nchunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert result == ["chunk1", "chunk2", "chunk3"]
|
||||
|
||||
def test_chinese_punctuation_literal(self, base_splitter_kwargs):
|
||||
"""Chinese punctuation should work as literal delimiters (related to issue)."""
|
||||
# Test Chinese comma
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator=",",
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = "chunk1,chunk2,chunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert result == ["chunk1", "chunk2", "chunk3"]
|
||||
|
||||
def test_mixed_content_with_zwsp(self, base_splitter_kwargs):
|
||||
"""Test realistic content with ZWSP delimiters."""
|
||||
zwsp = "\u200b"
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator=zwsp,
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = f"First paragraph with some text.{zwsp}Second paragraph with more content.{zwsp}Third paragraph here."
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert "First paragraph" in result[0]
|
||||
assert "Second paragraph" in result[1]
|
||||
assert "Third paragraph" in result[2]
|
||||
|
||||
def test_empty_separator(self, base_splitter_kwargs):
|
||||
"""Empty separator should not split."""
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator="",
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
text = "chunk1 chunk2 chunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
# Should not split on empty separator
|
||||
assert len(result) == 1
|
||||
|
||||
def test_escaped_unicode_hex_notation(self, base_splitter_kwargs):
|
||||
"""Escaped Unicode hex notation \\u200b should be decoded."""
|
||||
splitter = FixedRecursiveCharacterTextSplitter(
|
||||
fixed_separator="\\u200b",
|
||||
**base_splitter_kwargs,
|
||||
)
|
||||
|
||||
zwsp = "\u200b"
|
||||
text = f"chunk1{zwsp}chunk2{zwsp}chunk3"
|
||||
result = splitter.split_text(text)
|
||||
|
||||
assert len(result) == 3
|
||||
assert result == ["chunk1", "chunk2", "chunk3"]
|
||||
Loading…
Reference in New Issue