Using Semantic Chunking
Semantic chunking is essential for applications where maintaining the logical flow and meaning of text is critical, such as document retrieval, question answering, or summarization. This approach helps each chunk's embedding capture the full intent and nuance of the original content, which improves downstream model performance.
import embed_anything
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
# Model passed to `embed_file` below as the embedder.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)

# Second model instance, handed to the config as `semantic_encoder`.
# This example loads the same checkpoint as `model`.
semantic_encoder = EmbeddingModel.from_pretrained_hf(
    WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)

# Select the "semantic" splitting strategy and supply the encoder for it.
config = TextEmbedConfig(
    chunk_size=256,
    batch_size=32,
    splitting_strategy="semantic",
    semantic_encoder=semantic_encoder,
)

data = embed_anything.embed_file("test_files/bank.txt", embedder=model, config=config)

# Print the text of every returned item, followed by a separator line.
for d in data:
    print(d.text)
    print("---" * 20)