Data generation

QAPairGenerator

Bases: BaseModel

Generates questions and answers for sections and subsections of a document.

Parameters:

    Name           Type   Description                                                Default
    openai_key     str    The API key for OpenAI.                                    required
    openai_model   str    The name of the OpenAI model to use.                       required
    seed           int    The seed for the random number generator. Defaults to 42.  42
Source code in docqa/core/data_generation.py
class QAPairGenerator(BaseModel):
    """
    Generates questions and answers for sections and subsections of a document.

    Args:
        openai_key (str): The API key for OpenAI.
        openai_model (str): The name of the OpenAI model to use.
        seed (int, optional): The seed for the random number generator. Defaults to 42.
    """

    class Config:
        arbitrary_types_allowed = True

    openai_key: str
    openai_model: str
    seed: int = 42

    @computed_field  # type: ignore[misc]
    @property
    def openai_client(self) -> OpenAI:
        return OpenAI(api_key=self.openai_key)

    output_format: str = (
        "Present your questions along with the detailed answers in the following JSON"
        ' format: [{"question": str, "answer": str}, ...].'
    )

    @computed_field  # type: ignore[misc]
    @property
    def question_types(self) -> dict[str, dict[str, str]]:
        return {
            "sparse": {
                "system_message": (
                    "You are a professional examiner. Your job is to give questions to"
                    " test people's understanding of a given document."
                ),
                "instruction": (
                    "You are given a document, your goal is:\n- construct a list of"
                    " different complex questions that can be answered based **solely**"
                    " on the given text.\n- make sure to cover all of the topics"
                    " described in the document.\n- include the answer for each"
                    " question, the answers should be as detailed as"
                    " possible.\n"
                ),
            },
            "dense": {
                "system_message": "You are a top university professor.",
                "instruction": (
                    "You are a top university professor. You have the below text and"
                    " you want to test the student's understanding of it. If you can"
                    " only ask {num_questions} question(s) but must cover all of the"
                    " content and the answer(s) to those questions must contain"
                    " **solely** the information presented in the given text, what"
                    " would you ask?\n"
                ),
            },
        }

    @staticmethod
    def sanitize_output_format(output: dict | list) -> list[dict]:
        """This static method takes in an `output` of type `dict` or `list` and returns
            a sanitized `list[dict]` output.

        Args:
            output (dict | list): The input `output` that needs to be sanitized.

        Returns:
            list[dict]: The sanitized output as a list of dictionaries.

        Raises:
            ValueError: If the `output` format is invalid.

        """
        if isinstance(output, dict):
            # Either {"questions": [...]} or a single QA dict.
            if list(output.keys()) == ["questions"] and isinstance(
                output["questions"], list
            ):
                output = output["questions"]
            else:
                output = [output]
        elif isinstance(output, list):
            # Unwrap [{"questions": [...]}]; guard against an empty list first.
            if output and list(output[0].keys()) == ["questions"]:
                output = output[0]["questions"]
        else:
            raise ValueError(f"Invalid output format: {type(output)}")

        return output  # type: ignore[return-value]

    def process(
        self,
        document: str,
        temperature: float = 1.0,
        question_type: str = "dense",
        num_questions: int = 5,
    ) -> tuple[list[dict[str, str]], dict]:
        """
        Process the given document to generate a list of questions and answers.

        Args:
            document (str): The text document to process.
            temperature (float, optional): The temperature parameter for controlling
                the randomness of the output. Defaults to 1.0.
            question_type (str, optional): The type of questions to generate.
                Defaults to "dense".
            num_questions (int, optional): The number of questions to generate.
                Defaults to 5.

        Returns:
            tuple[list[dict[str, str]], dict]: A tuple containing the list of
                question-answer pairs and a metadata dict.

        Raises:
            ValueError: If an invalid question type is provided.
        """
        if question_type not in self.question_types:
            raise ValueError(f"Invalid question type: {question_type}")

        system_message = self.question_types[question_type]["system_message"]
        instruction = self.question_types[question_type]["instruction"].format(
            num_questions=num_questions
        )
        messages = [
            {"role": "system", "content": system_message},
            {
                "role": "user",
                "content": instruction + self.output_format,
            },
            {"role": "user", "content": "\nHere is the given text:\n\n" + document},
        ]

        response = self.openai_client.chat.completions.create(
            model=self.openai_model,
            messages=messages,
            temperature=temperature,
            seed=self.seed,
            response_format={"type": "json_object"},
        )
        text = response.choices[0].message.content.strip()

        output = json.loads(text)
        output = self.sanitize_output_format(output)

        metadata = {}
        metadata["finish_reason"] = response.choices[0].finish_reason
        metadata["usage"] = response.usage.model_dump()

        return output, metadata  # type: ignore[return-value]
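
Example usage, as a minimal sketch; the API key and model name are hypothetical placeholders, not values shipped with this module:

from docqa.core.data_generation import QAPairGenerator

qa_gen = QAPairGenerator(
    openai_key="sk-...",         # substitute a real key
    openai_model="gpt-4o-mini",  # hypothetical; any JSON-mode chat model works
)
qa_pairs, metadata = qa_gen.process(
    document="Generative agents are computational software agents that ...",
    question_type="dense",
    num_questions=3,
)
for pair in qa_pairs:
    print(pair["question"], "->", pair["answer"][:80])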

sanitize_output_format staticmethod

sanitize_output_format(output)

Normalize an output of type dict or list into a sanitized list[dict].

Parameters:

    Name     Type          Description                   Default
    output   dict | list   The output to sanitize.       required

Returns:

    Type         Description
    list[dict]   The sanitized output as a list of dictionaries.

Raises:

    Type         Description
    ValueError   If the output format is invalid.

Source code in docqa/core/data_generation.py
@staticmethod
def sanitize_output_format(output: dict | list) -> list[dict]:
    """This static method takes in an `output` of type `dict` or `list` and returns
        a sanitized `list[dict]` output.

    Args:
        output (dict | list): The input `output` that needs to be sanitized.

    Returns:
        list[dict]: The sanitized output as a list of dictionaries.

    Raises:
        ValueError: If the `output` format is invalid.

    """
    if isinstance(output, dict):
        # Either {"questions": [...]} or a single QA dict.
        if list(output.keys()) == ["questions"] and isinstance(
            output["questions"], list
        ):
            output = output["questions"]
        else:
            output = [output]
    elif isinstance(output, list):
        # Unwrap [{"questions": [...]}]; guard against an empty list first.
        if output and list(output[0].keys()) == ["questions"]:
            output = output[0]["questions"]
    else:
        raise ValueError(f"Invalid output format: {type(output)}")

    return output  # type: ignore[return-value]
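
To illustrate, the three shapes the sanitizer accepts all collapse to the same flat list; this quick sketch runs without an API key:

from docqa.core.data_generation import QAPairGenerator

qa = {"question": "q", "answer": "a"}
assert QAPairGenerator.sanitize_output_format({"questions": [qa]}) == [qa]
assert QAPairGenerator.sanitize_output_format([{"questions": [qa]}]) == [qa]
assert QAPairGenerator.sanitize_output_format(qa) == [qa]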

process

process(document, temperature=1.0, question_type='dense', num_questions=5)

Process the given document to generate a list of questions and answers.

Parameters:

    Name            Type    Description                                                    Default
    document        str     The text document to process.                                  required
    temperature     float   The temperature parameter for controlling the randomness
                            of the output. Defaults to 1.0.                                1.0
    question_type   str     The type of questions to generate. Defaults to "dense".       'dense'
    num_questions   int     The number of questions to generate. Defaults to 5.           5

Returns:

    Type                                Description
    tuple[list[dict[str, str]], dict]   A tuple containing the list of question-answer
                                        pairs and a metadata dict.

Raises:

    Type         Description
    ValueError   If an invalid question type is provided.

Source code in docqa/core/data_generation.py
def process(
    self,
    document: str,
    temperature: float = 1.0,
    question_type: str = "dense",
    num_questions: int = 5,
) -> tuple[list[dict[str, str]], dict]:
    """
    Process the given document to generate a list of questions and answers.

    Args:
        document (str): The text document to process.
        temperature (float, optional): The temperature parameter for controlling
            the randomness of the output. Defaults to 1.0.
        question_type (str, optional): The type of questions to generate.
            Defaults to "dense".
        num_questions (int, optional): The number of questions to generate.
            Defaults to 5.

    Returns:
        tuple[list[dict[str, str]], dict]: A tuple containing the list of
            question-answer pairs and a metadata dict.

    Raises:
        ValueError: If an invalid question type is provided.
    """
    if question_type not in self.question_types:
        raise ValueError(f"Invalid question type: {question_type}")

    system_message = self.question_types[question_type]["system_message"]
    instruction = self.question_types[question_type]["instruction"].format(
        num_questions=num_questions
    )
    messages = [
        {"role": "system", "content": system_message},
        {
            "role": "user",
            "content": instruction + self.output_format,
        },
        {"role": "user", "content": "\nHere is the given text:\n\n" + document},
    ]

    response = self.openai_client.chat.completions.create(
        model=self.openai_model,
        messages=messages,
        temperature=temperature,
        seed=self.seed,
        response_format={"type": "json_object"},
    )
    text = response.choices[0].message.content.strip()

    output = json.loads(text)
    output = self.sanitize_output_format(output)

    metadata = {}
    metadata["finish_reason"] = response.choices[0].finish_reason
    metadata["usage"] = response.usage.model_dump()

    return output, metadata  # type: ignore[return-value]
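
The metadata returned alongside the QA pairs is useful for tracking token spend across calls. A short sketch, reusing the qa_gen instance from the earlier example and a hypothetical section_text string:

qa_pairs, metadata = qa_gen.process(document=section_text, question_type="sparse")
print(metadata["finish_reason"])          # "stop", or "length" if the output was truncated
print(metadata["usage"]["total_tokens"])  # prompt plus completion tokens for this call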

AnswerGenerator

Bases: BaseModel

Generate an answer to a question based on a reference.

Parameters:

    Name           Type   Description                                                Default
    openai_key     str    The OpenAI API key.                                        required
    openai_model   str    The name of the OpenAI model to use.                       required
    seed           int    The seed for the random number generator. Defaults to 42.  42
Source code in docqa/core/data_generation.py
class AnswerGenerator(BaseModel):
    """Generate an answer to a question based on a reference.

    Args:
        openai_key (str): The OpenAI API key.
        openai_model (str): The name of the OpenAI model to use.
        seed (int, optional): The seed for the random number generator. Defaults to 42.
    """

    class Config:
        arbitrary_types_allowed = True

    openai_key: str
    openai_model: str
    seed: int = 42

    @computed_field  # type: ignore[misc]
    @property
    def openai_client(self) -> OpenAI:
        return OpenAI(api_key=self.openai_key)

    system_message: str = (
        "You are a trusted factual chatbot. You always answer questions based strictly"
        " on the provided reference."
    )
    instruction: str = (
        "Reference(s):\n\n{reference}\n\nStrictly according to the provided"
        " reference(s), give an answer as detailed as possible to the following"
        " question: {question}"
    )

    def process(
        self,
        question: str,
        reference: str,
        temperature: float = 1.0,
    ) -> tuple[str, dict]:
        """
        Process the given question and generate a response using the OpenAI model.

        Args:
            question (str): The question to be processed.
            reference (str): The reference string for the instruction.
            temperature (float, optional): The temperature parameter for generating the
                response. Higher values (e.g., 1.0) make the output more random, while
                lower values (e.g., 0.2) make it more focused and deterministic.
                Defaults to 1.0.

        Returns:
            tuple[str, dict]: A tuple containing the generated answer and metadata.

        Return structure:
            - answer (str): The generated answer as a string.
            - metadata (dict): Additional metadata about the response.
                - finish_reason (str): The reason why the completion finished.
                - usage (dict): Usage statistics of the completion.
                    - completed_tokens (int): The number of tokens used for
                        completion.
                    - prompt_tokens (int): The number of tokens used for the prompt.
                    - total_tokens (int): The total number of tokens used.
        """
        messages = [
            {"role": "system", "content": self.system_message},
            {
                "role": "user",
                "content": self.instruction.format(
                    reference=reference, question=question
                ),
            },
        ]

        response = self.openai_client.chat.completions.create(
            model=self.openai_model,
            messages=messages,
            temperature=temperature,
            seed=self.seed,
        )
        answer = response.choices[0].message.content.strip()

        metadata = {}
        metadata["finish_reason"] = response.choices[0].finish_reason
        metadata["usage"] = {
            "completed_tokens": response.usage.completion_tokens,
            "prompt_tokens": response.usage.prompt_tokens,
            "total_tokens": response.usage.total_tokens,
        }

        return answer, metadata
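
Example usage, as a minimal sketch with a hypothetical key, model, and reference text:

from docqa.core.data_generation import AnswerGenerator

answer_gen = AnswerGenerator(
    openai_key="sk-...",         # substitute a real key
    openai_model="gpt-4o-mini",  # hypothetical model name
)
answer, metadata = answer_gen.process(
    question="What drives the agents' daily routines?",
    reference="===\n[source: Architecture]\nAgents plan their days by ...\n===\n",
    temperature=0.2,  # lower temperature for a more focused answer
)
print(answer)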

process

process(question, reference, temperature=1.0)

Process the given question and generate a response using the OpenAI model.

Parameters:

    Name          Type    Description                                                   Default
    question      str     The question to be processed.                                 required
    reference     str     The reference string for the instruction.                     required
    temperature   float   The temperature parameter for generating the response.
                          Higher values (e.g., 1.0) make the output more random,
                          while lower values (e.g., 0.2) make it more focused and
                          deterministic. Defaults to 1.0.                               1.0

Returns:

    Type               Description
    tuple[str, dict]   A tuple containing the generated answer and metadata.

Return structure
  • answer (str): The generated answer as a string.
  • metadata (dict): Additional metadata about the response.
    • finish_reason (str): The reason why the completion finished.
    • usage (dict): Usage statistics of the completion.
      • completed_tokens (int): The number of tokens used for completion.
      • prompt_tokens (int): The number of tokens used for the prompt.
      • total_tokens (int): The total number of tokens used.
Source code in docqa/core/data_generation.py
def process(
    self,
    question: str,
    reference: str,
    temperature: float = 1.0,
) -> tuple[str, dict]:
    """
    Process the given question and generate a response using the OpenAI model.

    Args:
        question (str): The question to be processed.
        reference (str): The reference string for the instruction.
        temperature (float, optional): The temperature parameter for generating the
            response. Higher values (e.g., 1.0) make the output more random, while
            lower values (e.g., 0.2) make it more focused and deterministic.
            Defaults to 1.0.

    Returns:
        tuple[str, dict]: A tuple containing the generated answer and metadata.

    Return structure:
        - answer (str): The generated answer as a string.
        - metadata (dict): Additional metadata about the response.
            - finish_reason (str): The reason why the completion finished.
            - usage (dict): Usage statistics of the completion.
                - completed_tokens (int): The number of tokens used for
                    completion.
                - prompt_tokens (int): The number of tokens used for the prompt.
                - total_tokens (int): The total number of tokens used.
    """
    messages = [
        {"role": "system", "content": self.system_message},
        {
            "role": "user",
            "content": self.instruction.format(
                reference=reference, question=question
            ),
        },
    ]

    response = self.openai_client.chat.completions.create(
        model=self.openai_model,
        messages=messages,
        temperature=temperature,
        seed=self.seed,
    )
    answer = response.choices[0].message.content.strip()

    metadata = {}
    metadata["finish_reason"] = response.choices[0].finish_reason
    metadata["usage"] = {
        "completed_tokens": response.usage.completion_tokens,
        "prompt_tokens": response.usage.prompt_tokens,
        "total_tokens": response.usage.total_tokens,
    }

    return answer, metadata

generate_top_sections_questions

generate_top_sections_questions(doc_tree, output_file, openai_key='', openai_model='', seed=42, temperature=1.0)

Generate the top sections with questions based on the provided document tree.

Parameters:

    Name           Type    Description                                                     Default
    doc_tree       dict    The document tree representing the sections of the document.   required
    output_file    Path    The path to the output file where the top sections with
                           questions will be saved.                                        required
    openai_key     str     The OpenAI API key. Defaults to an empty string.                ''
    openai_model   str     The OpenAI model to use for question generation. Defaults
                           to an empty string.                                             ''
    seed           int     The seed value for random number generation. Defaults to 42.   42
    temperature    float   The temperature parameter for question generation.
                           Defaults to 1.0.                                                1.0

Returns:

    Name   Type   Description
    dict   dict   The top sections with questions.

Source code in docqa/core/data_generation.py
def generate_top_sections_questions(
    doc_tree: dict,
    output_file: Path,
    openai_key: str = "",
    openai_model: str = "",
    seed: int = 42,
    temperature: float = 1.0,
) -> dict:
    """
    Generate the top sections with questions based on the provided document tree.

    Args:
        doc_tree (dict): The document tree representing the sections of the document.
        output_file (Path): The path to the output file where the top sections with
            questions will be saved.
        openai_key (str, optional): The OpenAI API key. Defaults to an empty string.
        openai_model (str, optional): The OpenAI model to use for question generation.
            Defaults to an empty string.
        seed (int, optional): The seed value for random number generation.
            Defaults to 42.
        temperature (float, optional): The temperature parameter for question
            generation.
            Defaults to 1.0.

    Returns:
        dict: The top sections with questions.
    """
    output_file = Path(output_file)
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            top_sections_with_questions = json.load(f)
        return top_sections_with_questions

    qa_gen = QAPairGenerator(
        openai_key=openai_key,
        openai_model=openai_model,
        seed=seed,
    )

    top_sections_with_questions = {}
    if doc_tree["text"]:
        top_sections_with_questions[""] = {
            "text": doc_tree["text"],
            "chunks_count": len(chunk_content(doc_tree["text"])),
        }

    for section in doc_tree.get("child_sections", []):
        full_text = get_section_full_text(section)
        top_sections_with_questions[section["heading"]] = {
            "text": full_text,
            "chunks_count": len(chunk_content(full_text)),
        }

    for heading, section in top_sections_with_questions.items():
        print(f"Generating questions for {heading}")
        dense_questions, _ = qa_gen.process(
            section["text"],
            question_type="dense",
            num_questions=section["chunks_count"],
            temperature=temperature,
        )
        sparse_questions, _ = qa_gen.process(
            section["text"],
            question_type="sparse",
            temperature=temperature,
        )
        top_sections_with_questions[heading]["dense_questions"] = dense_questions
        top_sections_with_questions[heading]["sparse_questions"] = sparse_questions

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(top_sections_with_questions, f, indent=4, ensure_ascii=False)

    return top_sections_with_questions
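
A minimal usage sketch. The doc_tree shape shown here is inferred from the function body ("text" plus nested "child_sections", each with a "heading"); the key, model name, and file path are hypothetical. Note that if output_file already exists, the cached JSON is loaded and returned instead of regenerating:

from pathlib import Path

from docqa.core.data_generation import generate_top_sections_questions

doc_tree = {
    "text": "Abstract of the paper ...",
    "child_sections": [
        {
            "heading": "Introduction",
            "text": "The introduction ...",
            "child_sections": [],
        },
    ],
}
sections = generate_top_sections_questions(
    doc_tree,
    output_file=Path("artifacts/top_sections_questions.json"),
    openai_key="sk-...",
    openai_model="gpt-4o-mini",
)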

generate_long_answers_for_sections_questions

generate_long_answers_for_sections_questions(sections_with_questions, output_file, openai_key='', openai_model='', seed=42, temperature=1.0)

Generate long answers for sections' questions.

Parameters:

    Name                      Type    Description                                               Default
    sections_with_questions   dict    A dictionary containing sections with their
                                      corresponding questions.                                  required
    output_file               Path    The path to the output file where the generated
                                      long answers will be stored.                              required
    openai_key                str     The API key for OpenAI. Defaults to an empty string.     ''
    openai_model              str     The name of the OpenAI model to use. Defaults to an
                                      empty string.                                             ''
    seed                      int     The seed value for random number generation.
                                      Defaults to 42.                                           42
    temperature               float   The temperature parameter for generating answers.
                                      Defaults to 1.0.                                          1.0

Returns:

    Name   Type   Description
    dict   dict   A dictionary containing sections with their corresponding questions
                  and generated long answers.

Source code in docqa/core/data_generation.py
def generate_long_answers_for_sections_questions(
    sections_with_questions: dict,
    output_file: Path,
    openai_key: str = "",
    openai_model: str = "",
    seed: int = 42,
    temperature: float = 1.0,
) -> dict:
    """
    Generate long answers for sections' questions.

    Args:
        sections_with_questions (dict): A dictionary containing sections with their
            corresponding questions.
        output_file (Path): The path to the output file where the generated long answers
            will be stored.
        openai_key (str, optional): The API key for OpenAI. Defaults to an empty string.
        openai_model (str, optional): The name of the OpenAI model to use. Defaults to
            an empty string.
        seed (int, optional): The seed value for random number generation.
            Defaults to 42.
        temperature (float, optional): The temperature parameter for generating answers.
            Defaults to 1.0.

    Returns:
        dict: A dictionary containing sections with their corresponding questions and
            generated long answers.
    """
    output_file = Path(output_file)
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            sections_with_questions_and_long_answers = json.load(f)
        return sections_with_questions_and_long_answers

    answer_gen = AnswerGenerator(
        openai_key=openai_key,
        openai_model=openai_model,
        seed=seed,
    )
    for heading, section in sections_with_questions.items():
        print(f"Generating long answers for dense questions of {heading}")
        reference = f"===\n[source: {heading}]\n{section['text']}\n===\n"

        for question in section["dense_questions"]:
            answer, _ = answer_gen.process(
                question=question["question"],
                reference=reference,
                temperature=temperature,
            )
            question["long_answer"] = answer

        print(f"Generating long answers for sparse questions of {heading}")
        for question in section["sparse_questions"]:
            answer, _ = answer_gen.process(
                question=question["question"],
                reference=reference,
                temperature=temperature,
            )
            question["long_answer"] = answer

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(sections_with_questions, f, indent=4, ensure_ascii=False)

    return sections_with_questions
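
This pairs naturally with generate_top_sections_questions: feed its output in, and each question dict gains a "long_answer" key. A sketch continuing the example above (path and credentials remain hypothetical):

from docqa.core.data_generation import generate_long_answers_for_sections_questions

sections_with_answers = generate_long_answers_for_sections_questions(
    sections,  # output of generate_top_sections_questions above
    output_file=Path("artifacts/long_answers.json"),
    openai_key="sk-...",
    openai_model="gpt-4o-mini",
    temperature=0.7,
)
first = sections_with_answers["Introduction"]["dense_questions"][0]
print(first["question"], "->", first["long_answer"][:80])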

make_simple_sample_for_openai

make_simple_sample_for_openai(question, answer)

Generates a simple sample for OpenAI chat conversation.

Parameters:

    Name       Type   Description               Default
    question   str    The user's question.      required
    answer     str    The assistant's answer.   required

Returns:

    Name   Type   Description
    dict   dict   A dictionary containing the chat conversation sample.

Example

make_simple_sample_for_openai("What is the capital of France?", "Paris")

Source code in docqa/core/data_generation.py
def make_simple_sample_for_openai(question: str, answer: str) -> dict:
    """
    Generates a simple sample for OpenAI chat conversation.

    Args:
        question (str): The user's question.
        answer (str): The assistant's answer.

    Returns:
        dict: A dictionary containing the chat conversation sample.

    Example:
        make_simple_sample_for_openai("What is the capital of France?", "Paris")
    """
    return {
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a trusted factual chatbot that only answers questions"
                    " about generative agents."
                ),
            },
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }
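
The returned dict is one chat sample in the {"messages": [...]} layout used for OpenAI chat fine-tuning; appending one JSON object per line yields a training JSONL file. A short sketch (the file name is illustrative):

import json

from docqa.core.data_generation import make_simple_sample_for_openai

sample = make_simple_sample_for_openai("What is the capital of France?", "Paris")
with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(sample, ensure_ascii=False) + "\n")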

make_instruction_sample_for_openai

make_instruction_sample_for_openai(question, answer, references)

Generates an instruction sample for OpenAI chat conversation.

Parameters:

    Name         Type        Description                                         Default
    question     str         The question to be used in the instruction.         required
    answer       str         The answer to be used in the instruction.           required
    references   list[str]   A list of reference texts to be included in the
                             instruction.                                        required

Returns:

    Name   Type   Description
    dict   dict   A dictionary containing the chat conversation sample.

Source code in docqa/core/data_generation.py
def make_instruction_sample_for_openai(
    question: str, answer: str, references: list[str]
) -> dict:
    """
    Generates an instruction sample for OpenAI chat conversation.

    Args:
        question (str): The question to be used in the instruction.
        answer (str): The answer to be used in the instruction.
        references (list[str]): A list of reference texts to be included in the
            instruction.

    Returns:
        dict: A dictionary containing the chat conversation sample.
    """
    reference_text = "\n\n".join(["===\n" + ref + "\n===" for ref in references])
    system_message = AnswerGenerator.model_fields["system_message"].default
    instruction = AnswerGenerator.model_fields["instruction"].default
    instruction = instruction.format(reference=reference_text, question=question)
    return {
        "messages": [
            {
                "role": "system",
                "content": system_message,
            },
            {
                "role": "user",
                "content": instruction,
            },
            {"role": "assistant", "content": answer},
        ]
    }
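
Unlike make_simple_sample_for_openai, this variant reuses the default system message and instruction template from AnswerGenerator, so a fine-tuned model sees the same prompt format in training as at inference time. A sketch with illustrative strings:

from docqa.core.data_generation import make_instruction_sample_for_openai

sample = make_instruction_sample_for_openai(
    question="What drives the agents' daily routines?",
    answer="According to the reference, agents plan their days by ...",
    references=["[source: Architecture]\nAgents plan their days by ..."],
)
print(sample["messages"][1]["content"][:120])  # the user turn: references plus question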