Skip to content

📚 References

This module provides functions and classes for embedding queries, files, and directories using different embedding models.

The module includes the following functions:

  • embed_query: Embeds the given query and returns an EmbedData object.
  • embed_file: Embeds the file at the given path and returns a list of EmbedData objects.
  • embed_directory: Embeds all the files in the given directory and returns a list of EmbedData objects.

The module also includes the EmbedData class, which represents the data of an embedded file.

Usage:

import numpy as np

import embed_anything
from embed_anything import (
    AudioDecoderModel,
    EmbedData,
    EmbeddingModel,
    TextEmbedConfig,
    WhichModel,
    embed_audio_file,
)

# For text files
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert, model_id="Hugging_face_link"
)
data = embed_anything.embed_file("test_files/test.pdf", embedder=model)

# For images
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Clip,
    model_id="openai/clip-vit-base-patch16",
    # revision="refs/pr/15",
)
data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model)
# Use a distinct loop name so the `data` list is not shadowed.
embeddings = np.array([item.embedding for item in data])
query = ["Photo of a monkey?"]
query_embedding = np.array(
    embed_anything.embed_query(query, embedder=model)[0].embedding
)

# For audio files
# Choose any whisper or distilwhisper model from https://huggingface.co/distil-whisper or https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013
audio_decoder = AudioDecoderModel.from_pretrained_hf(
    "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
)
embedder = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
config = TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_audio_file(
    "test_files/audio/samples_hp0.wav",
    audio_decoder=audio_decoder,
    embedder=embedder,
    text_embed_config=config,
)

You can also store the embeddings in a vector database instead of keeping them in memory. Here is an example of how to use the PineconeAdapter class:

import os

import embed_anything
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import PineconeAdapter


# Initialize the PineconeAdapter with the API key from the environment
api_key = os.environ.get("PINECONE_API_KEY")
index_name = "anything"
pinecone_adapter = PineconeAdapter(api_key)

# Best-effort cleanup: drop a previous index with the same name, if any.
try:
    pinecone_adapter.delete_index(index_name)
except Exception:
    pass

# Create the index that will receive the embeddings
pinecone_adapter.create_index(dimension=512, metric="cosine")

# bert_model = EmbeddingModel.from_pretrained_hf(
#     WhichModel.Bert, "sentence-transformers/all-MiniLM-L12-v2", revision="main"
# )

clip_model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Clip, "openai/clip-vit-base-patch16", revision="main"
)

embed_config = TextEmbedConfig(chunk_size=512, batch_size=32)


data = embed_anything.embed_image_directory(
    "test_files",
    embedder=clip_model,
    adapter=pinecone_adapter,
    # config=embed_config,
)

Supported Embedding Models:

  • Text Embedding Models:

    • "OpenAI"
    • "Bert"
    • "Jina"
  • Image Embedding Models:

    • "Clip"
    • "SigLip" (Coming Soon)
  • Audio Embedding Models:

    • "Whisper"

AudioDecoderModel

Represents an audio decoder model.

Attributes:

Name Type Description
model_id str

The ID of the audio decoder model.

revision str

The revision of the audio decoder model.

model_type str

The type of the audio decoder model.

quantized bool

A flag indicating whether the audio decoder model is quantized or not.

Example:

model = embed_anything.AudioDecoderModel.from_pretrained_hf(
    model_id="openai/whisper-tiny.en",
    revision="main",
    model_type="tiny-en",
    quantized=False
)

Source code in python/python/embed_anything/_embed_anything.pyi
class AudioDecoderModel:
    """
    Represents an audio decoder model.

    Attributes:
        model_id: The ID of the audio decoder model.
        revision: The revision of the audio decoder model.
        model_type: The type of the audio decoder model.
        quantized: A flag indicating whether the audio decoder model is quantized or not.

    Example:
    ```python

    model = embed_anything.AudioDecoderModel.from_pretrained_hf(
        model_id="openai/whisper-tiny.en",
        revision="main",
        model_type="tiny-en",
        quantized=False
    )
    ```
    """

    model_id: str
    revision: str
    model_type: str
    quantized: bool

    def from_pretrained_hf(
        model_id: str | None = None,
        revision: str | None = None,
        model_type: str | None = None,
        quantized: bool | None = None,
    ): ...

ColbertModel

Represents the Colbert model.

Source code in python/python/embed_anything/_embed_anything.pyi
class ColbertModel:
    """
    Represents the Colbert model.
    """

    def __init__(
        self,
        hf_model_id: str | None = None,
        revision: str | None = None,
        path_in_repo: str | None = None,
    ):
        """
        Initializes the ColbertModel object.

        Args:
            hf_model_id: The ID of the model from Hugging Face. Defaults to None.
            revision: The revision of the model. Defaults to None.
            path_in_repo: The path to the model in the repository. Defaults to None.
        """

    def from_pretrained_onnx(
        self,
        hf_model_id: str | None = None,
        revision: str | None = None,
        path_in_repo: str | None = None,
    ) -> ColbertModel:
        """
        Loads a pre-trained Colbert model from the Hugging Face model hub.

        Args:
            hf_model_id: The ID of the model from Hugging Face. Defaults to None.
            revision: The revision of the model. Defaults to None.
            path_in_repo: The path to the model in the repository. Defaults to None.

        Returns:
            A ColbertModel object.
        """

    def embed(
        self, text_batch: list[str], batch_size: int | None = None, is_doc: bool = True
    ) -> list[EmbedData]:
        """
        Embeds the given text and returns a list of EmbedData objects.

        Args:
            text_batch: The texts to embed.
            batch_size: The batch size to use. Defaults to None.
            is_doc: NOTE(review): presumably marks the batch as documents
                rather than queries; confirm. Defaults to True.

        Returns:
            A list of EmbedData objects.
        """

__init__(hf_model_id=None, revision=None, path_in_repo=None)

Initializes the ColbertModel object.

Source code in python/python/embed_anything/_embed_anything.pyi
def __init__(
    self,
    hf_model_id: str | None = None,
    revision: str | None = None,
    path_in_repo: str | None = None,
):
    """
    Initializes the ColbertModel object.

    Args:
        hf_model_id: The ID of the model from Hugging Face. Defaults to None.
        revision: The revision of the model. Defaults to None.
        path_in_repo: The path to the model in the repository. Defaults to None.
    """

embed(text_batch, batch_size=None, is_doc=True)

Embeds the given text and returns a list of EmbedData objects.

Source code in python/python/embed_anything/_embed_anything.pyi
def embed(
    self, text_batch: list[str], batch_size: int | None = None, is_doc: bool = True
) -> list[EmbedData]:
    """
    Embeds the given text and returns a list of EmbedData objects.

    Args:
        text_batch: The texts to embed.
        batch_size: The batch size to use. Defaults to None.
        is_doc: NOTE(review): presumably marks the batch as documents
            rather than queries; confirm. Defaults to True.

    Returns:
        A list of EmbedData objects.
    """

from_pretrained_onnx(hf_model_id=None, revision=None, path_in_repo=None)

Loads a pre-trained Colbert model from the Hugging Face model hub.

Parameters:

Name Type Description
hf_model_id

The ID of the model from Hugging Face.

revision

The revision of the model.

path_in_repo

The path to the model in the repository.

Returns:

Type Description
ColbertModel

A ColbertModel object.

Source code in python/python/embed_anything/_embed_anything.pyi
def from_pretrained_onnx(
    self,
    hf_model_id: str | None = None,
    revision: str | None = None,
    path_in_repo: str | None = None,
) -> ColbertModel:
    """
    Loads a pre-trained Colbert model from the Hugging Face model hub.

    Args:
        hf_model_id: The ID of the model from Hugging Face. Defaults to None.
        revision: The revision of the model. Defaults to None.
        path_in_repo: The path to the model in the repository. Defaults to None.

    Returns:
        A ColbertModel object.
    """

ColpaliModel

Represents the Colpali model.

Source code in python/python/embed_anything/_embed_anything.pyi
class ColpaliModel:
    """
    Represents the Colpali model.
    """

    def __init__(self, model_id: str, revision: str | None = None):
        """
        Initializes the ColpaliModel object.

        Args:
            model_id: The ID of the model from Hugging Face.
            revision: The revision of the model. Defaults to None.
        """

    # The two loaders below take no ``self`` and return a new model, so they
    # are declared static to let type checkers accept calls on the class.
    @staticmethod
    def from_pretrained(model_id: str, revision: str | None = None) -> ColpaliModel:
        """
        Loads a pre-trained Colpali model from the Hugging Face model hub.

        Args:
            model_id: The ID of the model from Hugging Face.
            revision: The revision of the model. Defaults to None.

        Returns:
            A ColpaliModel object.
        """

    @staticmethod
    def from_pretrained_onnx(
        model_id: str, revision: str | None = None
    ) -> ColpaliModel:
        """
        Loads a pre-trained ONNX Colpali model from the Hugging Face model hub.

        Args:
            model_id: The ID of the model from Hugging Face.
            revision: The revision of the model. Defaults to None.

        Returns:
            A ColpaliModel object.
        """

    def embed_file(self, file_path: str, batch_size: int | None = 1) -> list[EmbedData]:
        """
        Embeds the given PDF file and returns a list of EmbedData objects, one for each page in the file.

        The PDF is first converted into images, and each image is then embedded.

        Args:
            file_path: The path to the PDF file to embed.
            batch_size: The batch size for processing the embeddings. Default is 1.

        Returns:
            A list of EmbedData objects for each page in the file.
        """

    def embed_query(self, query: str) -> list[EmbedData]:
        """
        Embeds the given query and returns a list of EmbedData objects.

        Args:
            query: The query to embed.

        Returns:
            A list of EmbedData objects.
        """

__init__(model_id, revision=None)

Initializes the ColpaliModel object.

Parameters:

Name Type Description Default
model_id str

The ID of the model from Hugging Face.

required
revision str | None

The revision of the model.

None
Source code in python/python/embed_anything/_embed_anything.pyi
def __init__(self, model_id: str, revision: str | None = None):
    """
    Initializes the ColpaliModel object.

    Args:
        model_id: The ID of the model from Hugging Face.
        revision: The revision of the model. Defaults to None.
    """

embed_file(file_path, batch_size=1)

Embeds the given PDF file and returns a list of EmbedData objects, one for each page in the file. The PDF is first converted into images, and each image is then embedded.

Parameters:

Name Type Description Default
file_path str

The path to the pdf file to embed.

required
batch_size int | None

The batch size for processing the embeddings. Default is 1.

1

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects for each page in the file.

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_file(self, file_path: str, batch_size: int | None = 1) -> list[EmbedData]:
    """
    Embeds the given PDF file and returns a list of EmbedData objects, one for each page in the file.

    The PDF is first converted into images, and each image is then embedded.

    Args:
        file_path: The path to the PDF file to embed.
        batch_size: The batch size for processing the embeddings. Default is 1.

    Returns:
        A list of EmbedData objects for each page in the file.
    """

embed_query(query)

Embeds the given query and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
query str

The query to embed.

required

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_query(self, query: str) -> list[EmbedData]:
    """
    Embeds the given query and returns a list of EmbedData objects.

    Args:
        query: The query text to embed.

    Returns:
        A list of EmbedData objects.
    """

from_pretrained(model_id, revision=None)

Loads a pre-trained Colpali model from the Hugging Face model hub.

Parameters:

Name Type Description Default
model_id str

The ID of the model from Hugging Face.

required
revision str | None

The revision of the model.

None

Returns:

Type Description
ColpaliModel

A ColpaliModel object.

Source code in python/python/embed_anything/_embed_anything.pyi
# Declared static: the loader takes no ``self`` and is called on the class.
@staticmethod
def from_pretrained(model_id: str, revision: str | None = None) -> ColpaliModel:
    """
    Loads a pre-trained Colpali model from the Hugging Face model hub.

    Args:
        model_id: The ID of the model from Hugging Face.
        revision: The revision of the model. Defaults to None.

    Returns:
        A ColpaliModel object.
    """

from_pretrained_onnx(model_id, revision=None)

Loads a pre-trained Colpali model from the Hugging Face model hub.

Parameters:

Name Type Description Default
model_id str

The ID of the model from Hugging Face.

required
revision str | None

The revision of the model.

None

Returns:

Type Description
ColpaliModel

A ColpaliModel object.

Source code in python/python/embed_anything/_embed_anything.pyi
# Declared static: the loader takes no ``self`` and is called on the class.
@staticmethod
def from_pretrained_onnx(
    model_id: str, revision: str | None = None
) -> ColpaliModel:
    """
    Loads a pre-trained ONNX Colpali model from the Hugging Face model hub.

    Args:
        model_id: The ID of the model from Hugging Face.
        revision: The revision of the model. Defaults to None.

    Returns:
        A ColpaliModel object.
    """

DocumentRank

Represents the rank of a document.

Attributes:

Name Type Description
document str

The document to rank.

relevance_score float

The relevance score of the document.

rank int

The rank of the document.

Source code in python/python/embed_anything/_embed_anything.pyi
class DocumentRank:
    """
    Represents one document together with its relevance score and rank.

    Attributes:
        document: The document to rank.
        relevance_score: The relevance score of the document.
        rank: The rank of the document.
    """

    document: str
    relevance_score: float
    rank: int

Dtype

Bases: Enum

Represents the data type of the model.

Source code in python/python/embed_anything/_embed_anything.pyi
class Dtype(Enum):
    """
    Represents the data type of the model.

    Accepted as the ``dtype`` argument of
    ``EmbeddingModel.from_pretrained_onnx``.
    """

    # Each member's value is its own name as a string.
    F16 = "F16"
    INT8 = "INT8"
    Q4 = "Q4"
    UINT8 = "UINT8"
    BNB4 = "BNB4"
    Q4F16 = "Q4F16"

EmbedData

Represents the data of an embedded file.

Attributes:

Name Type Description
embedding list[float]

The embedding of the file.

text str

The text for which the embedding is generated.

metadata dict[str, str]

Additional metadata associated with the embedding.

Source code in python/python/embed_anything/_embed_anything.pyi
class EmbedData:
    """Holds one embedding together with its source text and metadata.

    Attributes:
        embedding: The embedding of the file.
        text: The text for which the embedding is generated.
        metadata: Additional metadata associated with the embedding.
    """

    # Attribute declarations come first so the public fields are visible
    # before the constructor that fills them in.
    embedding: list[float]
    text: str
    metadata: dict[str, str]

    def __init__(self, embedding: list[float], text: str, metadata: dict[str, str]):
        # Store the three constructor arguments unchanged on the instance.
        self.embedding, self.text, self.metadata = embedding, text, metadata

EmbeddingModel

Represents an embedding model.

Source code in python/python/embed_anything/_embed_anything.pyi
class EmbeddingModel:
    """
    Represents an embedding model.

    Instances are obtained through one of the ``from_pretrained_*`` factory
    methods, which are called on the class itself.
    """

    @staticmethod
    def from_pretrained_hf(
        model: WhichModel, model_id: str, revision: str | None = None
    ) -> EmbeddingModel:
        """
        Loads an embedding model from the Hugging Face model hub.

        Args:
            model: The architecture of the embedding model to use.
            model_id: The ID of the model.
            revision: The revision of the model. Defaults to None.

        Returns:
            An EmbeddingModel object.

        Example:
        ```python
        model = EmbeddingModel.from_pretrained_hf(
            model=WhichModel.Bert,
            model_id="sentence-transformers/all-MiniLM-L6-v2",
            revision="main"
        )
        ```
        """

    @staticmethod
    def from_pretrained_cloud(
        model: WhichModel, model_id: str, api_key: str | None = None
    ) -> EmbeddingModel:
        """
        Loads an embedding model from a cloud-based service.

        Args:
            model (WhichModel): The cloud service to use. Currently supports WhichModel.OpenAI and WhichModel.Cohere.
            model_id (str): The ID of the model to use.
                - For OpenAI, see available models at https://platform.openai.com/docs/guides/embeddings/embedding-models
                - For Cohere, see available models at https://docs.cohere.com/docs/cohere-embed
            api_key (str | None, optional): The API key for accessing the model. If not provided, it is taken from the environment variable:
                - For OpenAI: OPENAI_API_KEY
                - For Cohere: CO_API_KEY

        Returns:
            EmbeddingModel: An initialized EmbeddingModel object.

        Raises:
            ValueError: If an unsupported model is specified.

        Example:
        ```python
        # Using Cohere
        model = EmbeddingModel.from_pretrained_cloud(
            model=WhichModel.Cohere,
            model_id="embed-english-v3.0"
        )

        # Using OpenAI
        model = EmbeddingModel.from_pretrained_cloud(
            model=WhichModel.OpenAI,
            model_id="text-embedding-3-small"
        )
        ```
        """

    @staticmethod
    def from_pretrained_onnx(
        model: WhichModel,
        model_name: ONNXModel | None = None,
        hf_model_id: str | None = None,
        revision: str | None = None,
        dtype: Dtype | None = None,
        path_in_repo: str | None = None,
    ) -> EmbeddingModel:
        """
        Loads an ONNX embedding model.

        At least one of the following arguments must be provided:
            - model_name
            - hf_model_id

        If hf_model_id is provided, dtype is ignored and the path_in_repo has to be provided pointing to the model file in the repository.
        If model_name is provided, dtype is used to determine the model file to load.

        Args:
            model (WhichModel): The architecture of the embedding model to use.
            model_name (ONNXModel | None, optional): The name of the model. Defaults to None.
            hf_model_id (str | None, optional): The ID of the model from Hugging Face. Defaults to None.
            revision (str | None, optional): The revision of the model. Defaults to None.
            dtype (Dtype | None, optional): The dtype of the model. Defaults to None.
            path_in_repo (str | None, optional): The path to the model in the repository. Defaults to None.

        Returns:
            EmbeddingModel: An initialized EmbeddingModel object.

        Example:
        ```python
        model = EmbeddingModel.from_pretrained_onnx(
            model=WhichModel.Bert,
            model_name=ONNXModel.BGESmallENV15Q,
            dtype=Dtype.Q4F16
        )

        model = EmbeddingModel.from_pretrained_onnx(
            model=WhichModel.Bert,
            hf_model_id="jinaai/jina-embeddings-v3",
            path_in_repo="onnx/model_fp16.onnx"
        )
        ```

        Note:
            This method loads a pre-trained model in ONNX format, which can offer improved inference speed
            compared to standard PyTorch models. ONNX models are particularly useful for deployment
            scenarios where performance is critical.
        """

from_pretrained_cloud(model, model_id, api_key=None)

Loads an embedding model from a cloud-based service.

Parameters:

Name Type Description
model WhichModel

The cloud service to use. Currently supports WhichModel.OpenAI and WhichModel.Cohere.

model_id str

The ID of the model to use. - For OpenAI, see available models at https://platform.openai.com/docs/guides/embeddings/embedding-models - For Cohere, see available models at https://docs.cohere.com/docs/cohere-embed

api_key str | None

The API key for accessing the model. If not provided, it is taken from the environment variable: - For OpenAI: OPENAI_API_KEY - For Cohere: CO_API_KEY

Returns:

Name Type Description
EmbeddingModel EmbeddingModel

An initialized EmbeddingModel object.

Raises:

Type Description
ValueError

If an unsupported model is specified.

Example:

# Using Cohere
model = EmbeddingModel.from_pretrained_cloud(
    model=WhichModel.Cohere,
    model_id="embed-english-v3.0"
)

# Using OpenAI
model = EmbeddingModel.from_pretrained_cloud(
    model=WhichModel.OpenAI,
    model_id="text-embedding-3-small"
)

Source code in python/python/embed_anything/_embed_anything.pyi
# Declared static: the loader takes no ``self`` and is called on the class.
@staticmethod
def from_pretrained_cloud(
    model: WhichModel, model_id: str, api_key: str | None = None
) -> EmbeddingModel:
    """
    Loads an embedding model from a cloud-based service.

    Args:
        model (WhichModel): The cloud service to use. Currently supports WhichModel.OpenAI and WhichModel.Cohere.
        model_id (str): The ID of the model to use.
            - For OpenAI, see available models at https://platform.openai.com/docs/guides/embeddings/embedding-models
            - For Cohere, see available models at https://docs.cohere.com/docs/cohere-embed
        api_key (str | None, optional): The API key for accessing the model. If not provided, it is taken from the environment variable:
            - For OpenAI: OPENAI_API_KEY
            - For Cohere: CO_API_KEY

    Returns:
        EmbeddingModel: An initialized EmbeddingModel object.

    Raises:
        ValueError: If an unsupported model is specified.

    Example:
    ```python
    # Using Cohere
    model = EmbeddingModel.from_pretrained_cloud(
        model=WhichModel.Cohere,
        model_id="embed-english-v3.0"
    )

    # Using OpenAI
    model = EmbeddingModel.from_pretrained_cloud(
        model=WhichModel.OpenAI,
        model_id="text-embedding-3-small"
    )
    ```
    """

from_pretrained_hf(model, model_id, revision=None)

Loads an embedding model from the Hugging Face model hub.

Parameters:

Name Type Description
model_id

The ID of the model.

revision

The revision of the model.

Returns:

Type Description
EmbeddingModel

An EmbeddingModel object.

Example:

model = EmbeddingModel.from_pretrained_hf(
    model=WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main"
)

Source code in python/python/embed_anything/_embed_anything.pyi
# Declared static: the loader takes no ``self`` and is called on the class.
@staticmethod
def from_pretrained_hf(
    model: WhichModel, model_id: str, revision: str | None = None
) -> EmbeddingModel:
    """
    Loads an embedding model from the Hugging Face model hub.

    Args:
        model: The architecture of the embedding model to use.
        model_id: The ID of the model.
        revision: The revision of the model. Defaults to None.

    Returns:
        An EmbeddingModel object.

    Example:
    ```python
    model = EmbeddingModel.from_pretrained_hf(
        model=WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main"
    )
    ```
    """

from_pretrained_onnx(model, model_name=None, hf_model_id=None, revision=None, dtype=None, path_in_repo=None)

Loads an ONNX embedding model.

Parameters:

Name Type Description Default
model WhichModel

The architecture of the embedding model to use.

required
model_name ONNXModel | None

The name of the model. Defaults to None.

None
hf_model_id str | None

The ID of the model from Hugging Face. Defaults to None.

None
revision str | None

The revision of the model. Defaults to None.

None
dtype Dtype | None

The dtype of the model. Defaults to None.

None
path_in_repo str | None

The path to the model in the repository. Defaults to None.

None

Returns: EmbeddingModel: An initialized EmbeddingModel object.

At least one of the following arguments must be provided:
  • model_name
  • hf_model_id

If hf_model_id is provided, dtype is ignored and the path_in_repo has to be provided pointing to the model file in the repository. If model_name is provided, dtype is used to determine the model file to load.

Example:

model = EmbeddingModel.from_pretrained_onnx(
    model=WhichModel.Bert,
    model_name=ONNXModel.BGESmallENV15Q,
    dtype=Dtype.Q4F16
)

model = EmbeddingModel.from_pretrained_onnx(
    model=WhichModel.Bert,
    hf_model_id="jinaai/jina-embeddings-v3",
    path_in_repo="onnx/model_fp16.onnx"
)

Note: This method loads a pre-trained model in ONNX format, which can offer improved inference speed compared to standard PyTorch models. ONNX models are particularly useful for deployment scenarios where performance is critical.

Source code in python/python/embed_anything/_embed_anything.pyi
# Declared static: the loader takes no ``self`` and is called on the class.
@staticmethod
def from_pretrained_onnx(
    model: WhichModel,
    model_name: ONNXModel | None = None,
    hf_model_id: str | None = None,
    revision: str | None = None,
    dtype: Dtype | None = None,
    path_in_repo: str | None = None,
) -> EmbeddingModel:
    """
    Loads an ONNX embedding model.

    At least one of the following arguments must be provided:
        - model_name
        - hf_model_id

    If hf_model_id is provided, dtype is ignored and the path_in_repo has to be provided pointing to the model file in the repository.
    If model_name is provided, dtype is used to determine the model file to load.

    Args:
        model (WhichModel): The architecture of the embedding model to use.
        model_name (ONNXModel | None, optional): The name of the model. Defaults to None.
        hf_model_id (str | None, optional): The ID of the model from Hugging Face. Defaults to None.
        revision (str | None, optional): The revision of the model. Defaults to None.
        dtype (Dtype | None, optional): The dtype of the model. Defaults to None.
        path_in_repo (str | None, optional): The path to the model in the repository. Defaults to None.

    Returns:
        EmbeddingModel: An initialized EmbeddingModel object.

    Example:
    ```python
    model = EmbeddingModel.from_pretrained_onnx(
        model=WhichModel.Bert,
        model_name=ONNXModel.BGESmallENV15Q,
        dtype=Dtype.Q4F16
    )

    model = EmbeddingModel.from_pretrained_onnx(
        model=WhichModel.Bert,
        hf_model_id="jinaai/jina-embeddings-v3",
        path_in_repo="onnx/model_fp16.onnx"
    )
    ```

    Note:
        This method loads a pre-trained model in ONNX format, which can offer improved inference speed
        compared to standard PyTorch models. ONNX models are particularly useful for deployment
        scenarios where performance is critical.
    """

ImageEmbedConfig

Represents the configuration for the Image Embedding model.

Attributes:

Name Type Description
buffer_size int | None

The buffer size for the Image Embedding model. Default is 100.

Source code in python/python/embed_anything/_embed_anything.pyi
class ImageEmbedConfig:
    """
    Represents the configuration for the Image Embedding model.

    Attributes:
        buffer_size: The buffer size for the Image Embedding model. Default is 100.
    """

    def __init__(self, buffer_size: int | None = None):
        self.buffer_size = buffer_size
    buffer_size: int | None

ONNXModel

Bases: Enum

Enum representing various ONNX models.

| Enum Variant                     | Description                                      |
|----------------------------------|--------------------------------------------------|
| `AllMiniLML6V2`                  | sentence-transformers/all-MiniLM-L6-v2           |
| `AllMiniLML6V2Q`                 | Quantized sentence-transformers/all-MiniLM-L6-v2 |
| `AllMiniLML12V2`                 | sentence-transformers/all-MiniLM-L12-v2          |
| `AllMiniLML12V2Q`                | Quantized sentence-transformers/all-MiniLM-L12-v2|
| `ModernBERTBase`                 | nomic-ai/modernbert-embed-base                   |
| `ModernBERTLarge`                | nomic-ai/modernbert-embed-large                  |
| `BGEBaseENV15`                   | BAAI/bge-base-en-v1.5                            |
| `BGEBaseENV15Q`                  | Quantized BAAI/bge-base-en-v1.5                  |
| `BGELargeENV15`                  | BAAI/bge-large-en-v1.5                           |
| `BGELargeENV15Q`                 | Quantized BAAI/bge-large-en-v1.5                 |
| `BGESmallENV15`                  | BAAI/bge-small-en-v1.5 - Default                 |
| `BGESmallENV15Q`                 | Quantized BAAI/bge-small-en-v1.5                 |
| `NomicEmbedTextV1`               | nomic-ai/nomic-embed-text-v1                     |
| `NomicEmbedTextV15`              | nomic-ai/nomic-embed-text-v1.5                   |
| `NomicEmbedTextV15Q`             | Quantized nomic-ai/nomic-embed-text-v1.5         |
| `ParaphraseMLMiniLML12V2`        | sentence-transformers/paraphrase-MiniLM-L6-v2    |
| `ParaphraseMLMiniLML12V2Q`       | Quantized sentence-transformers/paraphrase-MiniLM-L6-v2 |
| `ParaphraseMLMpnetBaseV2`        | sentence-transformers/paraphrase-mpnet-base-v2   |
| `BGESmallZHV15`                  | BAAI/bge-small-zh-v1.5                           |
| `MultilingualE5Small`            | intfloat/multilingual-e5-small                   |
| `MultilingualE5Base`             | intfloat/multilingual-e5-base                    |
| `MultilingualE5Large`            | intfloat/multilingual-e5-large                   |
| `MxbaiEmbedLargeV1`              | mixedbread-ai/mxbai-embed-large-v1               |
| `MxbaiEmbedLargeV1Q`             | Quantized mixedbread-ai/mxbai-embed-large-v1     |
| `GTEBaseENV15`                   | Alibaba-NLP/gte-base-en-v1.5                     |
| `GTEBaseENV15Q`                  | Quantized Alibaba-NLP/gte-base-en-v1.5           |
| `GTELargeENV15`                  | Alibaba-NLP/gte-large-en-v1.5                    |
| `GTELargeENV15Q`                 | Quantized Alibaba-NLP/gte-large-en-v1.5          |
| `JINAV2SMALLEN`                  | jinaai/jina-embeddings-v2-small-en               |
| `JINAV2BASEEN`                   | jinaai/jina-embeddings-v2-base-en                |
| `JINAV3`                         | jinaai/jina-embeddings-v3                        |
| `SPLADEPPENV1`                   | prithivida/Splade_PP_en_v1                      |
| `SPLADEPPENV2`                   | prithivida/Splade_PP_en_v2                      |
Source code in python/python/embed_anything/_embed_anything.pyi
class ONNXModel(Enum):
    """
    Enumeration of the named ONNX model variants.

    The value of every member is the member's own name as a string. The table
    below maps each variant to the model it refers to; variants whose name
    ends in ``Q`` are the quantized versions.

    ```markdown
    | Enum Variant                     | Description                                      |
    |----------------------------------|--------------------------------------------------|
    | `AllMiniLML6V2`                  | sentence-transformers/all-MiniLM-L6-v2           |
    | `AllMiniLML6V2Q`                 | Quantized sentence-transformers/all-MiniLM-L6-v2 |
    | `AllMiniLML12V2`                 | sentence-transformers/all-MiniLM-L12-v2          |
    | `AllMiniLML12V2Q`                | Quantized sentence-transformers/all-MiniLM-L12-v2|
    | `ModernBERTBase`                 | nomic-ai/modernbert-embed-base                   |
    | `ModernBERTLarge`                | nomic-ai/modernbert-embed-large                  |
    | `BGEBaseENV15`                   | BAAI/bge-base-en-v1.5                            |
    | `BGEBaseENV15Q`                  | Quantized BAAI/bge-base-en-v1.5                  |
    | `BGELargeENV15`                  | BAAI/bge-large-en-v1.5                           |
    | `BGELargeENV15Q`                 | Quantized BAAI/bge-large-en-v1.5                 |
    | `BGESmallENV15`                  | BAAI/bge-small-en-v1.5 - Default                 |
    | `BGESmallENV15Q`                 | Quantized BAAI/bge-small-en-v1.5                 |
    | `NomicEmbedTextV1`               | nomic-ai/nomic-embed-text-v1                     |
    | `NomicEmbedTextV15`              | nomic-ai/nomic-embed-text-v1.5                   |
    | `NomicEmbedTextV15Q`             | Quantized nomic-ai/nomic-embed-text-v1.5         |
    | `ParaphraseMLMiniLML12V2`        | sentence-transformers/paraphrase-MiniLM-L6-v2    |
    | `ParaphraseMLMiniLML12V2Q`       | Quantized sentence-transformers/paraphrase-MiniLM-L6-v2 |
    | `ParaphraseMLMpnetBaseV2`        | sentence-transformers/paraphrase-mpnet-base-v2   |
    | `BGESmallZHV15`                  | BAAI/bge-small-zh-v1.5                           |
    | `MultilingualE5Small`            | intfloat/multilingual-e5-small                   |
    | `MultilingualE5Base`             | intfloat/multilingual-e5-base                    |
    | `MultilingualE5Large`            | intfloat/multilingual-e5-large                   |
    | `MxbaiEmbedLargeV1`              | mixedbread-ai/mxbai-embed-large-v1               |
    | `MxbaiEmbedLargeV1Q`             | Quantized mixedbread-ai/mxbai-embed-large-v1     |
    | `GTEBaseENV15`                   | Alibaba-NLP/gte-base-en-v1.5                     |
    | `GTEBaseENV15Q`                  | Quantized Alibaba-NLP/gte-base-en-v1.5           |
    | `GTELargeENV15`                  | Alibaba-NLP/gte-large-en-v1.5                    |
    | `GTELargeENV15Q`                 | Quantized Alibaba-NLP/gte-large-en-v1.5          |
    | `JINAV2SMALLEN`                  | jinaai/jina-embeddings-v2-small-en               |
    | `JINAV2BASEEN`                   | jinaai/jina-embeddings-v2-base-en                |
    | `JINAV3`                         | jinaai/jina-embeddings-v3                        |
    | `SPLADEPPENV1`                   | prithivida/Splade_PP_en_v1                       |
    | `SPLADEPPENV2`                   | prithivida/Splade_PP_en_v2                       |
    ```

    NOTE(review): the `ParaphraseMLMiniLML12V2` / `ParaphraseMLMiniLML12V2Q`
    variant names suggest an L12 model, while the table lists
    paraphrase-MiniLM-L6-v2 -- confirm which one is correct.
    """

    # sentence-transformers all-MiniLM variants
    AllMiniLML6V2 = "AllMiniLML6V2"
    AllMiniLML6V2Q = "AllMiniLML6V2Q"
    AllMiniLML12V2 = "AllMiniLML12V2"
    AllMiniLML12V2Q = "AllMiniLML12V2Q"

    # nomic-ai ModernBERT embed variants
    ModernBERTBase = "ModernBERTBase"
    ModernBERTLarge = "ModernBERTLarge"

    # BAAI bge English v1.5 variants (BGESmallENV15 is marked as the default)
    BGEBaseENV15 = "BGEBaseENV15"
    BGEBaseENV15Q = "BGEBaseENV15Q"
    BGELargeENV15 = "BGELargeENV15"
    BGELargeENV15Q = "BGELargeENV15Q"
    BGESmallENV15 = "BGESmallENV15"
    BGESmallENV15Q = "BGESmallENV15Q"

    # nomic-ai nomic-embed-text variants
    NomicEmbedTextV1 = "NomicEmbedTextV1"
    NomicEmbedTextV15 = "NomicEmbedTextV15"
    NomicEmbedTextV15Q = "NomicEmbedTextV15Q"

    # sentence-transformers paraphrase variants
    ParaphraseMLMiniLML12V2 = "ParaphraseMLMiniLML12V2"
    ParaphraseMLMiniLML12V2Q = "ParaphraseMLMiniLML12V2Q"
    ParaphraseMLMpnetBaseV2 = "ParaphraseMLMpnetBaseV2"

    # BAAI bge Chinese v1.5
    BGESmallZHV15 = "BGESmallZHV15"

    # intfloat multilingual-e5 variants
    MultilingualE5Small = "MultilingualE5Small"
    MultilingualE5Base = "MultilingualE5Base"
    MultilingualE5Large = "MultilingualE5Large"

    # mixedbread-ai mxbai-embed-large variants
    MxbaiEmbedLargeV1 = "MxbaiEmbedLargeV1"
    MxbaiEmbedLargeV1Q = "MxbaiEmbedLargeV1Q"

    # Alibaba-NLP gte English v1.5 variants
    GTEBaseENV15 = "GTEBaseENV15"
    GTEBaseENV15Q = "GTEBaseENV15Q"
    GTELargeENV15 = "GTELargeENV15"
    GTELargeENV15Q = "GTELargeENV15Q"

    # jinaai jina-embeddings variants
    JINAV2SMALLEN = "JINAV2SMALLEN"
    JINAV2BASEEN = "JINAV2BASEEN"
    JINAV3 = "JINAV3"

    # prithivida Splade_PP variants
    SPLADEPPENV1 = "SPLADEPPENV1"
    SPLADEPPENV2 = "SPLADEPPENV2"

Reranker

Represents the Reranker model.

Source code in python/python/embed_anything/_embed_anything.pyi
class Reranker:
    """
    Represents the Reranker model.

    An instance is created either directly through the constructor or through
    the `from_pretrained` static constructor, and is then used via `rerank`.
    """

    def __init__(
        self, model_id: str, revision: str | None = None, dtype: Dtype | None = None
    ):
        """
        Initializes the Reranker object.

        Args:
            model_id: Identifier of the reranker model.
            revision: Optional model revision. Default is None.
                NOTE(review): presumably a Hugging Face hub revision -- confirm.
            dtype: Optional Dtype for the model. Default is None.
                NOTE(review): exact effect is not visible in this stub -- confirm.
        """

    # Declared static: the signature takes no `self`/`cls`, so without the
    # decorator type checkers would bind `model_id` as the instance.
    @staticmethod
    def from_pretrained(
        model_id: str, revision: str | None = None, dtype: Dtype | None = None
    ) -> Reranker:
        """
        Loads a pre-trained Reranker model from the Hugging Face model hub.

        Args:
            model_id: Identifier of the model on the Hugging Face model hub.
            revision: Optional model revision. Default is None.
            dtype: Optional Dtype for the model. Default is None.
                NOTE(review): exact effect is not visible in this stub -- confirm.

        Returns:
            A Reranker instance.
        """

    def rerank(
        self, query: list[str], documents: list[str], top_k: int
    ) -> list[RerankerResult]:
        """
        Reranks the given documents for the query and returns a list of RerankerResult objects.

        Args:
            query: The list of query strings.
            documents: The list of document strings to rerank.
            top_k: NOTE(review): presumably the number of top-ranked documents
                to keep for each query -- confirm.

        Returns:
            A list of RerankerResult objects.
        """

__init__(model_id, revision=None, dtype=None)

Initializes the Reranker object.

Source code in python/python/embed_anything/_embed_anything.pyi
def __init__(
    self,
    model_id: str,
    revision: str | None = None,
    dtype: Dtype | None = None,
):
    """
    Initializes the Reranker object.

    Args:
        model_id: Identifier of the reranker model.
        revision: Optional model revision. Default is None.
            NOTE(review): presumably a Hugging Face hub revision -- confirm.
        dtype: Optional Dtype for the model. Default is None.
            NOTE(review): exact effect is not visible in this stub -- confirm.
    """

from_pretrained(model_id, revision=None, dtype=None)

Loads a pre-trained Reranker model from the Hugging Face model hub.

Source code in python/python/embed_anything/_embed_anything.pyi
def from_pretrained(
    model_id: str,
    revision: str | None = None,
    dtype: Dtype | None = None,
) -> Reranker:
    """
    Loads a pre-trained Reranker model from the Hugging Face model hub.

    Args:
        model_id: Identifier of the model on the Hugging Face model hub.
        revision: Optional model revision. Default is None.
        dtype: Optional Dtype for the model. Default is None.
            NOTE(review): exact effect is not visible in this stub -- confirm.

    Returns:
        A Reranker instance.
    """

rerank(query, documents, top_k)

Reranks the given documents for the query and returns a list of RerankerResult objects.

Source code in python/python/embed_anything/_embed_anything.pyi
def rerank(
    self, query: list[str], documents: list[str], top_k: int
) -> list[RerankerResult]:
    """
    Reranks the given documents for the query and returns a list of RerankerResult objects.

    Args:
        query: The list of query strings.
        documents: The list of document strings to rerank.
        top_k: NOTE(review): presumably the number of top-ranked documents
            to keep for each query -- confirm.

    Returns:
        A list of RerankerResult objects.
    """

RerankerResult

Represents the result of the reranking process.

Attributes:

Name Type Description
query str

The query to rerank.

documents list[DocumentRank]

The list of documents to rerank.

Source code in python/python/embed_anything/_embed_anything.pyi
class RerankerResult:
    """
    Holds the outcome of the reranking process for one query.

    Attributes:
        query: The query that was reranked.
        documents: The DocumentRank entries for the reranked documents.
    """

    # Query string this result belongs to.
    query: str

    # Reranked documents, one DocumentRank entry each.
    documents: list[DocumentRank]

TextEmbedConfig

Represents the configuration for the Text Embedding model.

Attributes:

Name Type Description
chunk_size int | None

The chunk size for the Text Embedding model.

batch_size int | None

The batch size for processing the embeddings. Default is 32. Increase or decrease it depending on available memory.

splitting_strategy str | None

The strategy to use for splitting the text into chunks. Default is "sentence".

semantic_encoder EmbeddingModel | None

The semantic encoder for the Text Embedding model. Default is None.

use_ocr bool | None

A flag indicating whether to use OCR for the Text Embedding model. Default is False.

Source code in python/python/embed_anything/_embed_anything.pyi
class TextEmbedConfig:
    """
    Configuration options for the Text Embedding model.

    Attributes:
        chunk_size: The chunk size for the Text Embedding model. Default is 256.
        overlap_ratio: Overlap ratio setting. Default is 0.0.
            NOTE(review): presumably the fraction of overlap between
            consecutive chunks -- confirm.
        batch_size: The batch size for processing the embeddings. Default is 32.
            Increase or decrease it depending on available memory.
        buffer_size: Buffer size setting. Default is 100.
            NOTE(review): what is buffered is not visible here -- confirm.
        splitting_strategy: The strategy used to split the text into chunks.
            Default is "sentence".
        semantic_encoder: The semantic encoder for the Text Embedding model.
            Default is None.
        use_ocr: Flag indicating whether to use OCR. Default is False.
    """

    # Declared attribute types; each one is assigned in __init__.
    chunk_size: int | None
    overlap_ratio: float | None
    batch_size: int | None
    buffer_size: int | None
    splitting_strategy: str | None
    semantic_encoder: EmbeddingModel | None
    use_ocr: bool | None

    def __init__(
        self,
        chunk_size: int | None = 256,
        overlap_ratio: float | None = 0.0,
        batch_size: int | None = 32,
        buffer_size: int | None = 100,
        splitting_strategy: str | None = "sentence",
        semantic_encoder: EmbeddingModel | None = None,
        use_ocr: bool | None = False,
    ):
        # Chunking options.
        self.chunk_size = chunk_size
        self.overlap_ratio = overlap_ratio

        # Batching / buffering options.
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        # Splitting strategy and its optional encoder.
        self.splitting_strategy = splitting_strategy
        self.semantic_encoder = semantic_encoder

        # OCR toggle.
        self.use_ocr = use_ocr

embed_audio_file(file_path, audio_decoder, embedder, text_embed_config=TextEmbedConfig(chunk_size=200, batch_size=32))

Embeds the given audio file and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
file_path str

The path to the audio file to embed.

required
audio_decoder AudioDecoderModel

The audio decoder model to use.

required
embedder EmbeddingModel

The embedding model to use.

required
text_embed_config TextEmbedConfig | None

The configuration for the embedding model.

TextEmbedConfig(chunk_size=200, batch_size=32)

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Example:

import embed_anything
audio_decoder = embed_anything.AudioDecoderModel.from_pretrained_hf(
    "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
)

embedder = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)

config = embed_anything.TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_anything.embed_audio_file(
    "test_files/audio/samples_hp0.wav",
    audio_decoder=audio_decoder,
    embedder=embedder,
    text_embed_config=config,
)

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_audio_file(
    file_path: str,
    audio_decoder: AudioDecoderModel,
    embedder: EmbeddingModel,
    text_embed_config: TextEmbedConfig | None = TextEmbedConfig(
        chunk_size=200,
        batch_size=32,
    ),
) -> list[EmbedData]:
    """
    Embeds the given audio file and returns a list of EmbedData objects.

    Args:
        file_path: The path to the audio file to embed.
        audio_decoder: The audio decoder model to use (the example below
            loads a Whisper model).
        embedder: The embedding model to use.
        text_embed_config: The configuration for the embedding model.
            Defaults to `TextEmbedConfig(chunk_size=200, batch_size=32)`.

    Returns:
        A list of EmbedData objects.

    Example:
    ```python
    import embed_anything

    audio_decoder = embed_anything.AudioDecoderModel.from_pretrained_hf(
        "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
    )

    embedder = embed_anything.EmbeddingModel.from_pretrained_hf(
        embed_anything.WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main",
    )

    config = embed_anything.TextEmbedConfig(chunk_size=200, batch_size=32)
    data = embed_anything.embed_audio_file(
        "test_files/audio/samples_hp0.wav",
        audio_decoder=audio_decoder,
        embedder=embedder,
        text_embed_config=config,
    )
    ```
    """

embed_directory(file_path, embedder, extensions, config=None, adapter=None)

Embeds the files in the given directory and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
file_path str

The path to the directory containing the files to embed.

required
embedder EmbeddingModel

The embedding model to use.

required
extensions list[str]

The list of file extensions to consider for embedding.

required
config TextEmbedConfig | None

The configuration for the embedding model.

None
adapter Adapter | None

The adapter to use for storing the embeddings in a vector database.

None

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Example:

import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_directory("test_files", embedder=model, extensions=[".pdf"])

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_directory(
    file_path: str,
    embedder: EmbeddingModel,
    extensions: list[str],
    config: TextEmbedConfig | None = None,
    adapter: Adapter | None = None,
) -> list[EmbedData]:
    """
    Embeds the files in the given directory and returns a list of EmbedData objects.

    Args:
        file_path: Path to the directory that contains the files to embed.
        embedder: The embedding model to use.
        extensions: File extensions to consider for embedding, e.g. `[".pdf"]`.
        config: Optional configuration for the embedding model. Default is None.
        adapter: Optional adapter used to store the embeddings in a vector
            database. Default is None.

    Returns:
        A list of EmbedData objects.

    Example:
    ```python
    import embed_anything

    model = embed_anything.EmbeddingModel.from_pretrained_hf(
        embed_anything.WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main",
    )
    data = embed_anything.embed_directory(
        "test_files", embedder=model, extensions=[".pdf"]
    )
    ```
    """

embed_file(file_path, embedder, config=None, adapter=None)

Embeds the given file and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
file_path str

The path to the file to embed.

required
embedder EmbeddingModel

The embedding model to use.

required
config TextEmbedConfig | None

The configuration for the embedding model.

None
adapter Adapter | None

The adapter to use for storing the embeddings in a vector database.

None

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Example:

import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_file("test_files/test.pdf", embedder=model)

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_file(
    file_path: str,
    embedder: EmbeddingModel,
    config: TextEmbedConfig | None = None,
    adapter: Adapter | None = None,
) -> list[EmbedData]:
    """
    Embeds the given file and returns a list of EmbedData objects.

    Args:
        file_path: Path to the file to embed.
        embedder: The embedding model to use.
        config: Optional configuration for the embedding model. Default is None.
        adapter: Optional adapter used to store the embeddings in a vector
            database. Default is None.

    Returns:
        A list of EmbedData objects.

    Example:
    ```python
    import embed_anything

    model = embed_anything.EmbeddingModel.from_pretrained_hf(
        embed_anything.WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main",
    )
    data = embed_anything.embed_file("test_files/test.pdf", embedder=model)
    ```
    """

embed_image_directory(file_path, embedder, config=None, adapter=None)

Embeds the images in the given directory and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
file_path str

The path to the directory containing the images to embed.

required
embedder EmbeddingModel

The embedding model to use.

required
config ImageEmbedConfig | None

The configuration for the embedding model.

None
adapter Adapter | None

The adapter to use for storing the embeddings in a vector database.

None

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_image_directory(
    file_path: str,
    embedder: EmbeddingModel,
    config: ImageEmbedConfig | None = None,
    adapter: Adapter | None = None,
) -> list[EmbedData]:
    """
    Embeds the images in the given directory and returns a list of EmbedData objects.

    Args:
        file_path: Path to the directory that contains the images to embed.
        embedder: The embedding model to use.
        config: Optional ImageEmbedConfig for the embedding model.
            Default is None.
        adapter: Optional adapter used to store the embeddings in a vector
            database. Default is None.

    Returns:
        A list of EmbedData objects.
    """

embed_query(query, embedder, config=None)

Embeds the given query and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
query list[str]

The query to embed.

required
embedder EmbeddingModel

The embedding model to use.

required
config TextEmbedConfig | None

The configuration for the embedding model.

None

Returns:

Type Description
list[EmbedData]

A list of EmbedData objects.

Example:

import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_query(["What is the capital of France?"], embedder=model)
Source code in python/python/embed_anything/_embed_anything.pyi
def embed_query(
    query: list[str],
    embedder: EmbeddingModel,
    config: TextEmbedConfig | None = None,
) -> list[EmbedData]:
    """
    Embeds the given query and returns a list of EmbedData objects.

    Args:
        query: The list of query strings to embed.
        embedder: The embedding model to use.
        config: Optional configuration for the embedding model. Default is None.

    Returns:
        A list of EmbedData objects.

    Example:

    ```python
    import embed_anything

    model = embed_anything.EmbeddingModel.from_pretrained_hf(
        embed_anything.WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main",
    )
    data = embed_anything.embed_query(
        ["What is the capital of France?"], embedder=model
    )
    ```
    """

embed_webpage(url, embedder, config, adapter)

Embeds the webpage at the given URL and returns a list of EmbedData objects.

Parameters:

Name Type Description Default
url str

The URL of the webpage to embed.

required
embedder EmbeddingModel

The embedding model to use.

required
config TextEmbedConfig | None

The configuration for the embedding model.

required
adapter Adapter | None

The adapter to use for storing the embeddings.

required

Returns:

Type Description
list[EmbedData] | None

A list of EmbedData objects

Example:

import embed_anything

model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_webpage(
    "https://www.akshaymakes.com/", embedder=model, config=None, adapter=None
)

Source code in python/python/embed_anything/_embed_anything.pyi
def embed_webpage(
    url: str,
    embedder: EmbeddingModel,
    config: TextEmbedConfig | None = None,
    adapter: Adapter | None = None,
) -> list[EmbedData] | None:
    """Embeds the webpage at the given URL and returns a list of EmbedData
    objects.

    `config` and `adapter` default to None, matching the other embed
    functions in this module, so both may be omitted.

    Args:
        url: The URL of the webpage to embed.
        embedder: The embedding model to use.
        config: Optional configuration for the embedding model. Default is None.
        adapter: Optional adapter to use for storing the embeddings.
            Default is None.

    Returns:
        A list of EmbedData objects, or None.
        NOTE(review): the conditions under which None is returned are not
        visible in this stub -- confirm.

    Example:
    ```python
    import embed_anything

    model = embed_anything.EmbeddingModel.from_pretrained_hf(
        embed_anything.WhichModel.Bert,
        model_id="sentence-transformers/all-MiniLM-L6-v2",
        revision="main",
    )
    data = embed_anything.embed_webpage(
        "https://www.akshaymakes.com/", embedder=model
    )
    ```
    """