Skip to content

Create dataset

create_openai_dataset

create_openai_dataset(sections_qa_data_flatten, section_type='main', question_type='dense', answer_type='long', prompt_type='instruction')

Generate a dataset for OpenAI based on the given sections QA data.

Parameters:

Name Type Description Default
sections_qa_data_flatten dict

A dictionary containing the flattened sections QA data.

required
section_type Literal['main', 'summary', 'metadata', 'extra']

The type of section to include in the dataset. Defaults to "main".

'main'
question_type Literal['dense', 'sparse']

The type of question to include in the dataset. Defaults to "dense".

'dense'
answer_type Literal['long', 'short']

The type of answer to include in the dataset. Defaults to "long".

'long'
prompt_type Literal['instruction', 'simple']

The type of prompt to use in the dataset. Defaults to "instruction".

'instruction'

Returns:

Type Description
list[dict]

The generated dataset for OpenAI.

Note
  • The dataset is generated based on the specified parameters.
  • Only sections that exist in the sections QA data will be included in the dataset.
  • For each section, the questions and answers are extracted based on the question type and answer type.
  • Depending on the prompt type, different sample generation functions are used to create the samples.
  • The dataset is a list of dictionaries, where each dictionary represents a sample.
Source code in docqa/demo/create_dataset.py
def create_openai_dataset(
    sections_qa_data_flatten: dict,
    section_type: Literal[
        "main", "summary", "metadata", "extra"
    ] = "main",  # keys in SECTIONS
    question_type: Literal["dense", "sparse"] = "dense",
    answer_type: Literal["long", "short"] = "long",
    prompt_type: Literal["instruction", "simple"] = "instruction",
) -> list[dict]:
    """
    Generate a dataset for OpenAI based on the given sections QA data.

    Args:
        sections_qa_data_flatten (dict): A dictionary containing the flattened sections
            QA data, keyed by section heading.
        section_type (Literal["main", "summary", "metadata", "extra"], optional): The
            type of section to include in the dataset. Defaults to "main".
        question_type (Literal["dense", "sparse"], optional): The type of question to
            include in the dataset. Defaults to "dense".
        answer_type (Literal["long", "short"], optional): The type of answer to include
            in the dataset. Defaults to "long".
        prompt_type (Literal["instruction", "simple"], optional): The type of prompt to
            use in the dataset. Defaults to "instruction".

    Returns:
        list[dict]: The generated dataset for OpenAI, one dict per sample.

    Raises:
        ValueError: If ``prompt_type`` is not one of "simple" or "instruction".
            (Literal hints are not enforced at runtime, so this is checked
            explicitly — previously an unknown value could silently re-append
            the prior iteration's sample.)

    Note:
        - Only sections that exist in the sections QA data will be included in the
          dataset.
        - For each section, the questions and answers are extracted based on the
          question type and answer type.
        - Depending on the prompt type, different sample generation functions are
          used to create the samples.
    """
    dataset: list[dict] = []
    for heading in SECTIONS[section_type]:
        # Skip sections that were not present in the input data.
        if heading not in sections_qa_data_flatten:
            continue
        section = sections_qa_data_flatten[heading]
        qa_list = section[f"{question_type}_questions"]
        for item in qa_list:
            question = item["question"]
            # "short" uses the plain answer; anything else falls back to the
            # long-form answer, matching the original behavior.
            answer = item["answer"] if answer_type == "short" else item["long_answer"]
            if prompt_type == "simple":
                sample = make_simple_sample_for_openai(question, answer)
            elif prompt_type == "instruction":
                # Instruction prompts ground the answer in the section text.
                reference = f"[source: {heading}]\n{section['text']}\n"
                sample = make_instruction_sample_for_openai(
                    question=question,
                    answer=answer,
                    references=[reference],
                )
            else:
                # Guard against unbound/stale `sample` on unexpected input.
                raise ValueError(f"Unknown prompt_type: {prompt_type!r}")
            dataset.append(sample)

    return dataset

pdf_to_qa_data

pdf_to_qa_data(output_dir, pdf_file)

Generates a QA data dictionary from a PDF file.

Parameters:

Name Type Description Default
output_dir Path

The directory where the output files will be saved.

required
pdf_file Path

The path to the PDF file.

required

Returns:

Name Type Description
dict dict

The generated QA data dictionary.

Source code in docqa/demo/create_dataset.py
def pdf_to_qa_data(output_dir: Path, pdf_file: Path) -> dict:
    """
    Generate a QA data dictionary from a PDF file.

    Loads a cached document tree from ``output_dir/doc_tree.json`` when one
    exists; otherwise builds it from the PDF. Then generates top-section
    questions and long answers via the OpenAI-backed helpers, writing
    intermediate JSON files into ``output_dir``.

    Args:
        output_dir (Path): The directory where the output files will be saved.
        pdf_file (Path): The path to the PDF file.

    Returns:
        dict: The generated QA data dictionary.
    """
    doc_tree_file = output_dir / "doc_tree.json"

    if doc_tree_file.exists():
        # Reuse the cached document tree to avoid re-parsing the PDF.
        with open(doc_tree_file, "r", encoding="utf-8") as f:
            doc_tree = json.load(f)
    else:
        doc_tree = build_doc_tree_from_pdf(pdf_file, output_dir=output_dir)

    # Shared LLM settings, hoisted so both generation steps cannot drift
    # apart. os.getenv defaults must be strings, hence "42" (was int 42).
    llm_kwargs = {
        "openai_key": os.getenv("OPENAI_API_KEY", ""),
        "openai_model": os.getenv("OPENAI_MODEL", ""),
        "seed": int(os.getenv("SEED", "42")),
        "temperature": 1.0,
    }

    top_sections_qa_data = generate_top_sections_questions(
        doc_tree,
        output_file=output_dir / "top_sections_qa_data.json",
        **llm_kwargs,
    )
    top_sections_qa_data = generate_long_answers_for_sections_questions(
        top_sections_qa_data,
        output_file=output_dir / "top_sections_qa_data_long_answers.json",
        **llm_kwargs,
    )

    return top_sections_qa_data