Skip to content

Create dataset

create_openai_dataset

create_openai_dataset(sections_qa_data_flatten, section_type='main', question_type='dense', answer_type='long', prompt_type='instruction')

Generate a dataset for OpenAI based on the given sections QA data.

Parameters:

Name Type Description Default
sections_qa_data_flatten dict

A dictionary containing the flattened sections QA data.

required
section_type Literal['main', 'summary', 'metadata', 'extra']

The type of section to include in the dataset. Defaults to "main".

'main'
question_type Literal['dense', 'sparse']

The type of question to include in the dataset. Defaults to "dense".

'dense'
answer_type Literal['long', 'short']

The type of answer to include in the dataset. Defaults to "long".

'long'
prompt_type Literal['instruction', 'simple']

The type of prompt to use in the dataset. Defaults to "instruction".

'instruction'

Returns:

Type Description
list[dict]

The generated dataset for OpenAI.

Note
  • The dataset is generated based on the specified parameters.
  • Only sections that exist in the sections QA data will be included in the dataset.
  • For each section, the questions and answers are extracted based on the question type and answer type.
  • Depending on the prompt type, different sample generation functions are used to create the samples.
  • The dataset is a list of dictionaries, where each dictionary represents a sample.
Source code in docqa/demo/create_dataset.py
def create_openai_dataset(
    sections_qa_data_flatten: dict,
    section_type: Literal[
        "main", "summary", "metadata", "extra"
    ] = "main",  # keys in SECTIONS
    question_type: Literal["dense", "sparse"] = "dense",
    answer_type: Literal["long", "short"] = "long",
    prompt_type: Literal["instruction", "simple"] = "instruction",
) -> list[dict]:
    """
    Generate a dataset for OpenAI based on the given sections QA data.

    Args:
        sections_qa_data_flatten (dict): A dictionary containing the flattened sections
            QA data, keyed by section heading.
        section_type (Literal["main", "summary", "metadata", "extra"], optional): The
            type of section to include in the dataset. Defaults to "main".
        question_type (Literal["dense", "sparse"], optional): The type of question to
            include in the dataset. Defaults to "dense".
        answer_type (Literal["long", "short"], optional): The type of answer to include
            in the dataset. Defaults to "long".
        prompt_type (Literal["instruction", "simple"], optional): The type of prompt to
            use in the dataset. Defaults to "instruction".

    Returns:
        list[dict]: The generated dataset for OpenAI, one dict per sample.

    Raises:
        ValueError: If ``prompt_type`` is not one of "simple" or "instruction".
            (Literal hints are not enforced at runtime, so this is checked
            explicitly — previously an unknown value could silently re-append
            the prior iteration's sample.)

    Note:
        - Only sections that exist in the sections QA data will be included in the
          dataset.
        - For each section, the questions and answers are extracted based on the
          question type and answer type.
        - Depending on the prompt type, different sample generation functions are
          used to create the samples.
    """
    dataset: list[dict] = []
    for heading in SECTIONS[section_type]:
        # Skip sections that were not present in the input data.
        if heading not in sections_qa_data_flatten:
            continue
        section = sections_qa_data_flatten[heading]
        qa_list = section[f"{question_type}_questions"]
        for item in qa_list:
            question = item["question"]
            # "short" uses the plain answer; anything else falls back to the
            # long-form answer, matching the original behavior.
            answer = item["answer"] if answer_type == "short" else item["long_answer"]
            if prompt_type == "simple":
                sample = make_simple_sample_for_openai(question, answer)
            elif prompt_type == "instruction":
                # Instruction prompts ground the answer in the section text.
                reference = f"[source: {heading}]\n{section['text']}\n"
                sample = make_instruction_sample_for_openai(
                    question=question,
                    answer=answer,
                    references=[reference],
                )
            else:
                # Guard against unbound/stale `sample` on unexpected input.
                raise ValueError(f"Unknown prompt_type: {prompt_type!r}")
            dataset.append(sample)

    return dataset

pdf_to_qa_data

pdf_to_qa_data(output_dir, pdf_file)

Generates a QA data dictionary from a PDF file.

Parameters:

Name Type Description Default
output_dir Path

The directory where the output files will be saved.

required
pdf_file Path

The path to the PDF file.

required

Returns:

Name Type Description
dict dict

The generated QA data dictionary.

Source code in docqa/demo/create_dataset.py
def pdf_to_qa_data(output_dir: Path, pdf_file: Path) -> dict:
    """
    Generate a QA data dictionary from a PDF file.

    Loads a cached document tree from ``output_dir/doc_tree.json`` when one
    exists; otherwise builds it from the PDF. Then generates top-section
    questions and long answers via the OpenAI-backed helpers, writing
    intermediate JSON files into ``output_dir``.

    Args:
        output_dir (Path): The directory where the output files will be saved.
        pdf_file (Path): The path to the PDF file.

    Returns:
        dict: The generated QA data dictionary.
    """
    doc_tree_file = output_dir / "doc_tree.json"

    if doc_tree_file.exists():
        # Reuse the cached document tree to avoid re-parsing the PDF.
        with open(doc_tree_file, "r", encoding="utf-8") as f:
            doc_tree = json.load(f)
    else:
        doc_tree = build_doc_tree_from_pdf(pdf_file, output_dir=output_dir)

    # Shared LLM settings, hoisted so both generation steps cannot drift
    # apart. os.getenv defaults must be strings, hence "42" (was int 42).
    llm_kwargs = {
        "openai_key": os.getenv("OPENAI_API_KEY", ""),
        "openai_model": os.getenv("OPENAI_MODEL", ""),
        "seed": int(os.getenv("SEED", "42")),
        "temperature": 1.0,
    }

    top_sections_qa_data = generate_top_sections_questions(
        doc_tree,
        output_file=output_dir / "top_sections_qa_data.json",
        **llm_kwargs,
    )
    top_sections_qa_data = generate_long_answers_for_sections_questions(
        top_sections_qa_data,
        output_file=output_dir / "top_sections_qa_data_long_answers.json",
        **llm_kwargs,
    )

    return top_sections_qa_data