Skip to content

Retrieval

SemanticRetriever

Bases: BaseModel

SemanticRetriever class for retrieving documents based on embeddings.

Parameters:

Name Type Description Default
embedding_model Any

The embedding model used to encode the corpus.

required
vector_db Collection

The Chroma vector database.

required
Source code in docqa/core/retrieval.py
class SemanticRetriever(BaseModel):
    """
    SemanticRetriever class for retrieving documents based on embeddings.

    Args:
        embedding_model (AnglE): The embedding model used to encode the corpus.
        vector_db (chromadb.Collection): The Chroma vector database.
    """

    class Config:
        # Allow non-pydantic field types (the AnglE model and the Chroma
        # collection) to be stored on this model.
        arbitrary_types_allowed = True

    embedding_model: AnglE
    vector_db: chromadb.Collection

    def process(
        self, query: str, top_k: int, metadata_filter: dict | None = None
    ) -> list[dict]:
        """
        Process the given query to retrieve the top-k results from the vector database.

        Args:
            query (str): The query string.
            top_k (int): The number of results to retrieve.
            metadata_filter (dict | None, optional): A dictionary specifying metadata
                filters. Defaults to None.

        Returns:
            list[dict]: The list of retrieved results, each with "score",
                "document", and "metadata" keys, ordered by similarity.
        """
        query_embeddings = self.embedding_model.encode({"text": query})

        results = self.vector_db.query(
            query_embeddings=query_embeddings, n_results=top_k, where=metadata_filter
        )

        # Chroma nests results one list per query; we issued a single query,
        # so index [0] selects its hits. Distances are converted to similarity
        # scores via 1 - distance.
        return [
            {"score": 1 - distance, "document": document, "metadata": metadata}
            for distance, document, metadata in zip(
                results["distances"][0],
                results["documents"][0],
                results["metadatas"][0],
            )
        ]

process

process(query, top_k, metadata_filter=None)

Process the given query to retrieve the top-k results from the vector database.

Parameters:

Name Type Description Default
query str

The query string.

required
top_k int

The number of results to retrieve.

required
metadata_filter dict | None

A dictionary specifying metadata filters. Defaults to None.

None

Returns:

Type Description
list[dict]

list[dict]: The list of retrieved results.

Source code in docqa/core/retrieval.py
def process(
    self, query: str, top_k: int, metadata_filter: dict | None = None
) -> list[dict]:
    """
    Process the given query to retrieve the top-k results from the vector database.

    Args:
        query (str): The query string.
        top_k (int): The number of results to retrieve.
        metadata_filter (dict | None, optional): A dictionary specifying metadata
            filters. Defaults to None.

    Returns:
        list[dict]: The list of retrieved results.
    """
    query_embeddings = self.embedding_model.encode({"text": query})

    results = self.vector_db.query(
        query_embeddings=query_embeddings, n_results=top_k, where=metadata_filter
    )

    output = []
    for i in range(len(results["ids"][0])):
        score = 1 - results["distances"][0][i]
        document = results["documents"][0][i]
        metadata = results["metadatas"][0][i]
        output.append({"score": score, "document": document, "metadata": metadata})

    return output