
Markdown

MarkdownTidier

Bases: BaseModel

Tidies the given markdown text using an OpenAI chat model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `openai_key` | `str` | The OpenAI API key. | *required* |
| `openai_model` | `str` | The OpenAI model to use. | *required* |
| `seed` | `int` | The seed passed to the OpenAI API for reproducible sampling. | `42` |
| `system_message` | `str` | The system message for the OpenAI model. | editor persona prompt (see source) |
| `instruction` | `str` | The reconstruction instruction for the OpenAI model. | reconstruction prompt (see source) |

The `openai_client` attribute is a computed property built from `openai_key`; it is not a constructor argument.
Source code in docqa/core/markdown.py
class MarkdownTidier(BaseModel):
    """
    Tidies the given markdown text using an OpenAI chat model.

    Args:
        openai_key (str): The OpenAI API key.
        openai_model (str): The OpenAI model to use.
        seed (int, optional): The seed passed to the OpenAI API for reproducible
            sampling. Defaults to 42.
        system_message (str, optional): The system message for the OpenAI model.
            Defaults to an editor persona prompt.
        instruction (str, optional): The instruction for the OpenAI model.
            Defaults to a markdown-reconstruction prompt.

    Attributes:
        openai_client (OpenAI): The OpenAI client, built from openai_key as a
            computed property.
    """

    class Config:
        arbitrary_types_allowed = True

    openai_key: str
    openai_model: str
    seed: int = 42
    system_message: str = (
        "You are a professional editor. Your job is to reconstruct the broken markdown"
        " text."
    )
    instruction: str = (
        "You are given a markdown text which was converted from pdf and thus has "
        "mixed-up sentences and paragraphs structure, your job is:\n"
        "- reconstruct the text with proper sentences and paragraphs.\n"
        "- keep the headings unchanged.\n"
        "- keep the original content verbatim.\n"
        "- discard unrelated text.\n"
        "Answer with **only the reconstructed text**  and nothing else.\n"
    )

    @computed_field  # type: ignore[misc]
    @property
    def openai_client(self) -> OpenAI:
        return OpenAI(api_key=self.openai_key)

    def process(self, markdown_text: str, temperature: float = 0.7) -> tuple[str, dict]:
        """
        Generates a response to a given markdown text using the OpenAI chat model.

        Args:
            markdown_text (str): The input markdown text to generate a response for.
            temperature (float, optional): The temperature of the model's output.
                Higher values make the output more random, while lower values make it
                more focused and deterministic. Defaults to 0.7.

        Returns:
            tuple[str, dict]: The generated response text, and metadata about
                the completion process, including the finish reason and token
                usage.

        Example:
            ```python
            >>> tidier.process("Hello, how are you?")
            ```
            ```text
            (
                'I am fine, thank you!',
                {
                    'finish_reason': 'stop',
                    'usage': {
                        'completed_tokens': 48,
                        'prompt_tokens': 6,
                        'total_tokens': 54
                    }
                }
            )
            ```
        """
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": self.instruction},
            {"role": "user", "content": markdown_text},
        ]

        response = self.openai_client.chat.completions.create(
            model=self.openai_model,
            messages=messages,
            temperature=temperature,
            seed=self.seed,
        )

        text = response.choices[0].message.content.strip()

        metadata = {}
        metadata["finish_reason"] = response.choices[0].finish_reason
        metadata["usage"] = {
            "completed_tokens": response.usage.completion_tokens,
            "prompt_tokens": response.usage.prompt_tokens,
            "total_tokens": response.usage.total_tokens,
        }

        return text, metadata
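
Example

A minimal usage sketch. The API key and model name below are placeholders; any chat-capable OpenAI model should work.

```python
from docqa.core.markdown import MarkdownTidier

tidier = MarkdownTidier(openai_key="sk-...", openai_model="gpt-4o-mini")

broken = "# Title\n\nThis sen tence was split\nacross lines by the PDF extractor."
text, metadata = tidier.process(broken, temperature=0.2)
print(text)
print(metadata["usage"])
```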

process

process(markdown_text, temperature=0.7)

Generates a response to a given markdown text using the OpenAI chat model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `markdown_text` | `str` | The input markdown text to generate a response for. | *required* |
| `temperature` | `float` | The temperature of the model's output. Higher values make the output more random; lower values make it more focused and deterministic. | `0.7` |

Returns:

| Type | Description |
| --- | --- |
| `str` | The generated response text. |
| `dict` | Metadata about the completion process, including the finish reason and token usage. |

Example

>>> tidier.process("Hello, how are you?")
(
    'I am fine, thank you!',
    {
        'finish_reason': 'stop',
        'usage': {
            'completed_tokens': 48,
            'prompt_tokens': 6,
            'total_tokens': 54
        }
    }
)

Source code in docqa/core/markdown.py
def process(self, markdown_text: str, temperature: float = 0.7) -> tuple[str, dict]:
    """
    Generates a response to a given markdown text using the OpenAI chat model.

    Args:
        markdown_text (str): The input markdown text to generate a response for.
        temperature (float, optional): The temperature of the model's output.
            Higher values make the output more random, while lower values make it
            more focused and deterministic. Defaults to 0.7.

    Returns:
        tuple[str, dict]: The generated response text, and metadata about the
            completion process, including the finish reason and token usage.

    Example:
        ```python
        >>> tidier.process("Hello, how are you?")
        ```
        ```text
        (
            'I am fine, thank you!',
            {
                'finish_reason': 'stop',
                'usage': {
                    'completed_tokens': 48,
                    'prompt_tokens': 6,
                    'total_tokens': 54
                }
            }
        )
        ```
    """
    messages = [
        {"role": "system", "content": self.system_message},
        {"role": "user", "content": self.instruction},
        {"role": "user", "content": markdown_text},
    ]

    response = self.openai_client.chat.completions.create(
        model=self.openai_model,
        messages=messages,
        temperature=temperature,
        seed=self.seed,
    )

    text = response.choices[0].message.content.strip()

    metadata = {}
    metadata["finish_reason"] = response.choices[0].finish_reason
    metadata["usage"] = {
        "completed_tokens": response.usage.completion_tokens,
        "prompt_tokens": response.usage.prompt_tokens,
        "total_tokens": response.usage.total_tokens,
    }

    return text, metadata

find_highest_markdown_heading_level

find_highest_markdown_heading_level(lines)

Takes a list of lines representing a markdown file as input. Finds the highest heading level (i.e. the smallest number of leading # characters) and returns it as an integer. Returns None if the text contains no headings.

Source

https://github.com/nestordemeure/question_extractor/blob/main/question_extractor/markdown.py

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lines` | `list[str]` | A list of lines in the markdown file. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `int \| None` | The highest heading level as an integer, or `None` if no headings are found. |

Source code in docqa/core/markdown.py
def find_highest_markdown_heading_level(lines: list[str]) -> int | None:
    """
    Takes a list of lines representing a markdown file as input.
    Finds the highest heading level (i.e. the smallest number of leading '#'
    characters) and returns it as an integer.
    Returns None if the text contains no headings.

    Source:
        https://github.com/nestordemeure/question_extractor/blob/main/question_extractor/markdown.py

    Args:
        lines (list[str]): A list of lines in the markdown file.

    Returns:
        int | None: The highest heading level as an integer, or None if no headings
            are found.
    """
    highest_heading_level = None
    code_section = False

    # Iterate through the lines in the markdown file
    for line in lines:
        """
        Check code section e.g.:
            ```bash
            # Trace an IP packet between two Pods
            antctl trace-packet -S ns1/pod1 -D ns2/pod2
            # Trace a Service request from a local Pod
            antctl trace-packet -S ns1/pod1 -D ns2/svc2 -f "tcp,tcp_dst=80"
            # Trace the Service reply packet (assuming "ns2/pod2" is the Service
            # backend Pod)
            antctl trace-packet -D ns1/pod1 -S ns2/pod2 -f "tcp,tcp_src=80"
            # Trace an IP packet from a Pod to gateway port
            antctl trace-packet -S ns1/pod1 -D antrea-gw0
            # Trace a UDP packet from a Pod to an IP address
            antctl trace-packet -S ns1/pod1 -D 10.1.2.3 -f udp,udp_dst=1234
            # Trace a UDP packet from an IP address to a Pod
            antctl trace-packet -D ns1/pod1 -S 10.1.2.3 -f udp,udp_src=1234
            ```
        Here # is a code comment not the md level symbole
        """
        if line.startswith("```"):
            code_section = not code_section
        # Check if the line starts with a heading
        if line.startswith("#") and not code_section:
            # Calculate the heading level from the number of leading '#'
            # characters (robust to headings written without a space,
            # e.g. "#Heading")
            current_heading_level = len(line) - len(line.lstrip("#"))

            # Update highest_heading_level if it is unset or if the current
            # heading is higher-level (fewer '#' characters)
            if (highest_heading_level is None) or (
                current_heading_level < highest_heading_level
            ):
                highest_heading_level = current_heading_level

    return highest_heading_level
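
Example

A small illustrative input; note that '#' lines inside a code fence are ignored.

```python
lines = [
    "## Getting started",
    "Some text.",
    "```bash",
    "# a shell comment, not a heading",
    "```",
    "### Details",
]
find_highest_markdown_heading_level(lines)  # -> 2
```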

pdf_to_markdown

pdf_to_markdown(pdf_file, output_file, max_pages=None, parallel_factor=1, cache_dir=Path('.cache/pdf_to_markdown/'))

Converts a PDF file to Markdown format and saves the result to an output file.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pdf_file` | `Path` | The path to the PDF file to be converted. | *required* |
| `output_file` | `Path \| None` | The path to the output file where the converted Markdown will be saved. If `None`, the result is not written to disk. | *required* |
| `max_pages` | `int \| None` | The maximum number of pages to convert. | `None` |
| `parallel_factor` | `int` | The number of parallel processes to use for conversion. | `1` |
| `cache_dir` | `Path` | The directory to use for caching the conversion. | `Path('.cache/pdf_to_markdown/')` |

Returns:

| Type | Description |
| --- | --- |
| `str` | The converted Markdown text. |

Source code in docqa/core/markdown.py
def pdf_to_markdown(
    pdf_file: Path,
    output_file: Path | None,
    max_pages: int | None = None,
    parallel_factor: int = 1,
    cache_dir: Path = Path(".cache/pdf_to_markdown/"),
) -> str:
    """
    Converts a PDF file to Markdown format and saves the result to an output file.

    Args:
        pdf_file (Path): The path to the PDF file to be converted.
        output_file (Path | None): The path to the output file where the
            converted Markdown will be saved. If None, the result is not
            written to disk.
        max_pages (int | None, optional): The maximum number of pages to convert.
            Defaults to None.
        parallel_factor (int, optional): The number of parallel processes to use for
            conversion. Defaults to 1.
        cache_dir (Path, optional): The directory to use for caching the
            conversion. Defaults to Path(".cache/pdf_to_markdown/").

    Returns:
        str: The converted Markdown text.
    """
    markdown_text, metadata = convert_single_pdf(
        pdf_file,
        model_lst=load_all_models(),
        max_pages=max_pages,
        parallel_factor=parallel_factor,
        cache_dir=cache_dir,
    )

    if output_file is not None:
        output_file = Path(output_file)
        output_file.parent.mkdir(exist_ok=True, parents=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(markdown_text)

    return markdown_text
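
Example

A minimal invocation sketch; the file paths are hypothetical.

```python
from pathlib import Path

markdown_text = pdf_to_markdown(
    pdf_file=Path("papers/example.pdf"),    # hypothetical input path
    output_file=Path("output/example.md"),  # hypothetical output path
    max_pages=10,
)
print(markdown_text[:200])
```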

filter_empty_sections

filter_empty_sections(sections)

Filters out empty sections from a list of tuples.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `sections` | `list[tuple[str, str]]` | A list of tuples representing sections, where each tuple contains a heading (`str`) and content (`str`). | *required* |

Returns:

| Type | Description |
| --- | --- |
| `list[tuple[str, str]]` | A list of tuples representing non-empty sections, where each tuple contains a heading (`str`) and content (`str`). |

Source code in docqa/core/markdown.py
def filter_empty_sections(sections: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """
    Filters out empty sections from a list of tuples.

    Args:
        sections (list[tuple[str, str]]): A list of tuples representing sections, where
            each tuple contains a heading (str) and content (str).

    Returns:
        list[tuple[str, str]]: A list of tuples representing non-empty sections, where
            each tuple contains a heading (str) and content (str).
    """
    return [(heading, content) for heading, content in sections if heading or content]
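
Example

A minimal illustration: a section survives the filter if either its heading or its content is non-empty.

```python
sections = [
    ("# Introduction", "Some content."),
    ("", ""),               # fully empty -> dropped
    ("## References", ""),  # heading only -> kept
]
filter_empty_sections(sections)
# [('# Introduction', 'Some content.'), ('## References', '')]
```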

merge_abstract_with_previous_sections

merge_abstract_with_previous_sections(sections)

If an Abstract section is found, assume the document is a research paper and merge the Abstract with all preceding sections. This is done because the authors block often spans multiple columns, which scrambles the parsed section order.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `sections` | `list[tuple[str, str]]` | A list of tuples representing sections, where each tuple contains a heading (`str`) and content (`str`). | *required* |

Returns:

| Type | Description |
| --- | --- |
| `list[tuple[str, str]]` | A list of tuples representing the merged sections, where each tuple contains a heading (`str`) and content (`str`). |

Source code in docqa/core/markdown.py
def merge_abstract_with_previous_sections(
    sections: list[tuple[str, str]],
) -> list[tuple[str, str]]:
    """
    If an Abstract section is found, assume the document is a research paper
    and merge the Abstract with all preceding sections. This is done because
    the authors block often spans multiple columns, which scrambles the parsed
    section order.

    Args:
        sections (list[tuple[str, str]]): A list of tuples representing sections, where
            each tuple contains a heading (str) and content (str).

    Returns:
        list[tuple[str, str]]: A list of tuples representing merged sections, where
            each tuple contains a heading (str) and content (str).
    """

    if len(sections) < 2:
        return sections

    first_heading = sections[0][0]
    text_sections = [sections[0][1]]

    for i in range(1, len(sections)):
        heading, content = sections[i]
        current_text = f"{heading}\n\n{content}"
        text_sections.append(current_text)
        if re.sub(r"[^a-zA-Z]", "", heading).lower() == "abstract":
            combined_text = "\n\n".join(text_sections)
            return [(first_heading, combined_text)] + sections[i + 1 :]

    return sections
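
Example

An illustrative input (hypothetical paper sections): everything up to and including the Abstract is folded into the first section, and later sections are left untouched.

```python
sections = [
    ("# A Study of X", "Alice, Bob"),
    ("## Affiliations", "University of Somewhere"),
    ("## Abstract", "We study X."),
    ("## 1 Introduction", "X matters because..."),
]
merge_abstract_with_previous_sections(sections)
# [('# A Study of X',
#   'Alice, Bob\n\n## Affiliations\n\nUniversity of Somewhere\n\n## Abstract\n\nWe study X.'),
#  ('## 1 Introduction', 'X matters because...')]
```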

preprocess_sections

preprocess_sections(sections)

Preprocesses the given list of sections by filtering out any empty sections and merging any abstract sections with their previous sections.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `sections` | `list[tuple[str, str]]` | A list of tuples representing sections. Each tuple contains two strings: the title of the section and its content. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `list[tuple[str, str]]` | A list of tuples representing the preprocessed sections. Each tuple contains two strings: the title of the section and its content. |

Source code in docqa/core/markdown.py
def preprocess_sections(sections: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """
    Preprocesses the given list of sections by filtering out any empty sections and
    merging any abstract sections with their previous sections.

    Args:
        sections (list[tuple[str, str]]): A list of tuples representing sections.
            Each tuple contains two strings: the title of the section and its
            content.

    Returns:
        list[tuple[str, str]]: A list of tuples representing the preprocessed
            sections. Each tuple contains two strings: the title of the section
            and its content.
    """
    sections = filter_empty_sections(sections)
    sections = merge_abstract_with_previous_sections(sections)

    return sections
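
Example

A small end-to-end illustration of the two preprocessing steps: the empty section is dropped, then the Abstract is folded into the first section.

```python
sections = [("", ""), ("# Paper", "Alice, Bob"), ("## Abstract", "We study X.")]
preprocess_sections(sections)
# [('# Paper', 'Alice, Bob\n\n## Abstract\n\nWe study X.')]
```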

text_similarity_score

text_similarity_score(text1, text2)

Compute a word-level similarity score between two texts, based on the edit distance between their sorted word lists. The score is in [0, 1] and is invariant to word order.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text1` | `str` | The first text. | *required* |
| `text2` | `str` | The second text. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `float` | The similarity score between the two texts. |

Source code in docqa/core/markdown.py
def text_similarity_score(text1: str, text2: str) -> float:
    """
    Compute a word-level similarity score between two texts, based on the edit
    distance between their sorted word lists. The score is in [0, 1] and is
    invariant to word order.

    Args:
        text1 (str): The first text.
        text2 (str): The second text.

    Returns:
        float: The similarity score between the two texts.
    """
    # remove special characters
    text1 = re.sub(r"[^a-zA-Z0-9\s]", "", text1.lower())
    text2 = re.sub(r"[^a-zA-Z0-9\s]", "", text2.lower())

    # split into words, the words are sorted to allow shuffling of content
    text1_words = sorted(text1.split())
    text2_words = sorted(text2.split())

    return 1 - editdistance.eval(text1_words, text2_words) / (
        max(len(text1_words), len(text2_words)) + 1e-6
    )
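
Example

Because the words are sorted before the edit distance is computed, the score ignores word order but penalizes word substitutions.

```python
text_similarity_score("the quick brown fox", "fox brown quick the")  # ~1.0
text_similarity_score("the quick brown fox", "the quick brown cat")  # ~0.75
```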

heading_similarity_score

heading_similarity_score(heading1, heading2)

Calculate a character-level similarity score between two headings, based on the edit distance between the heading strings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `heading1` | `str` | The first heading. | *required* |
| `heading2` | `str` | The second heading. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `float` | The similarity score between the two headings. |

Source code in docqa/core/markdown.py
def heading_similarity_score(heading1: str, heading2: str) -> float:
    """
    Calculate a character-level similarity score between two headings, based on
    the edit distance between the heading strings.

    Args:
        heading1 (str): The first heading.
        heading2 (str): The second heading.

    Returns:
        float: The similarity score between the two headings.
    """
    return 1 - editdistance.eval(heading1, heading2) / (
        max(len(heading1), len(heading2)) + 1e-6
    )
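
Example

Unlike text_similarity_score, this function compares raw characters, so it tolerates small OCR-style typos in a heading.

```python
heading_similarity_score("## Introduction", "## Introduction")  # ~1.0
heading_similarity_score("## Introduction", "## lntroduction")  # ~0.93, one-character typo
```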

preserve_content

preserve_content(heading, old_content, new_text, heading_similarity_threshold=0.7, content_similarity_threshold=0.8)

Decides whether the tidied text still contains the original heading, and whether its content is close enough to the original to keep:

1. Compare heading with the first line of new_text. If the similarity score is at or above heading_similarity_threshold, the new text still contains the heading, so the content after the heading is extracted as the new content; otherwise the entire new text is taken as the new content.
2. Compare old_content with the new content. If the similarity score is at or above content_similarity_threshold, the content is considered preserved and the new content is returned; otherwise the content has been modified too much and the old content is returned. In both cases the content similarity score is returned alongside.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `heading` | `str` | The heading of the old text. | *required* |
| `old_content` | `str` | The content of the old text. | *required* |
| `new_text` | `str` | The new text. | *required* |
| `heading_similarity_threshold` | `float` | The threshold for heading similarity. | `0.7` |
| `content_similarity_threshold` | `float` | The threshold for content similarity. | `0.8` |

Returns:

| Type | Description |
| --- | --- |
| `tuple[str, float]` | A tuple containing the kept content (the new content if it is preserved, otherwise the old content) and the content similarity score. |

Source code in docqa/core/markdown.py
def preserve_content(
    heading: str,
    old_content: str,
    new_text: str,
    heading_similarity_threshold: float = 0.7,
    content_similarity_threshold: float = 0.8,
) -> tuple[str, float]:
    """
    Decides whether the tidied text still contains the original heading, and
    whether its content is close enough to the original to keep:

    1. Compare heading with the first line of new_text. If the similarity
       score is at or above heading_similarity_threshold, the new text still
       contains the heading, so the content after the heading is extracted as
       the new content; otherwise the entire new text is taken as the new
       content.
    2. Compare old_content with the new content. If the similarity score is at
       or above content_similarity_threshold, the content is considered
       preserved and the new content is returned; otherwise the content has
       been modified too much and the old content is returned.

    Args:
        heading (str): The heading of the old text.
        old_content (str): The content of the old text.
        new_text (str): The new text.
        heading_similarity_threshold (float, optional): The threshold for heading
            similarity. Defaults to 0.7.
        content_similarity_threshold (float, optional): The threshold for content
            similarity. Defaults to 0.8.

    Returns:
        tuple[str, float]: A tuple containing the kept content (the new content
            if it is preserved, otherwise the old content) and the content
            similarity score.
    """
    parts = new_text.split("\n")
    new_heading = parts[0]

    heading_similarity = heading_similarity_score(heading, new_heading)
    if heading_similarity >= heading_similarity_threshold:
        # new text still contains heading
        new_content = "\n".join(parts[1:])
    else:
        # new text does not contain heading
        new_content = new_text

    content_similarity = text_similarity_score(old_content, new_content)
    if content_similarity >= content_similarity_threshold:
        # content is still preserved
        return new_content, content_similarity
    else:
        # content has been modified too much
        return old_content, content_similarity
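
Example

Two illustrative calls: the first keeps the tidied content because both the heading and the content match closely; the second falls back to the old content because the rewrite drifted too far.

```python
old = "The quick brown fox jumps over the lazy dog."

preserve_content("## Example", old, "## Example\n" + old)
# -> ('The quick brown fox jumps over the lazy dog.', ~1.0)

preserve_content("## Example", old, "Completely unrelated text.")
# -> ('The quick brown fox jumps over the lazy dog.', ~0.0) -- old content kept
```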

tidy_markdown_sections

tidy_markdown_sections(sections, max_length=4096, openai_key='', openai_model='', seed=42, heading_similarity_threshold=0.7, content_similarity_threshold=0.8)

Tidies up sections of markdown text by processing each section with the MarkdownTidier class. Sections longer than max_length tokens are passed through unchanged; for the remaining sections, the tidied text is kept only if it stays close enough to the original heading and content (see preserve_content).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `sections` | `list[tuple[str, str]]` | A list of tuples representing the sections of markdown text. Each tuple contains a heading and content. | *required* |
| `max_length` | `int` | The maximum section length in tokens; longer sections are left untouched. | `4096` |
| `openai_key` | `str` | The OpenAI API key. | `''` |
| `openai_model` | `str` | The OpenAI model to use. | `''` |
| `seed` | `int` | A seed value for reproducibility. | `42` |
| `heading_similarity_threshold` | `float` | The threshold for heading similarity. | `0.7` |
| `content_similarity_threshold` | `float` | The threshold for content similarity. | `0.8` |

Returns:

| Type | Description |
| --- | --- |
| `tuple[list[tuple[str, str]], list[dict]]` | A tuple containing the tidied sections and a list of metadata for each section. |

Source code in docqa/core/markdown.py
def tidy_markdown_sections(
    sections: list[tuple[str, str]],
    max_length: int = 4096,
    openai_key: str = "",
    openai_model: str = "",
    seed: int = 42,
    heading_similarity_threshold: float = 0.7,
    content_similarity_threshold: float = 0.8,
) -> tuple[list[tuple[str, str]], list[dict]]:
    """
    Tidies up sections of markdown text by processing each section with the
    MarkdownTidier class. Sections longer than max_length tokens are passed
    through unchanged; for the remaining sections, the tidied text is kept only
    if it stays close enough to the original heading and content
    (see preserve_content).

    Args:
        sections (list[tuple[str, str]]): A list of tuples representing the sections of
            markdown text. Each tuple contains a heading and content.
        max_length (int, optional): The maximum section length in tokens;
            longer sections are left untouched. Defaults to 4096.
        openai_key (str, optional): The OpenAI API key. Defaults to "".
        openai_model (str, optional): The OpenAI model to use. Defaults to "".
        seed (int, optional): A seed value for reproducibility. Defaults to 42.
        heading_similarity_threshold (float, optional): The threshold for heading
            similarity. Defaults to 0.7.
        content_similarity_threshold (float, optional): The threshold for content
            similarity. Defaults to 0.8.

    Returns:
        tuple[list[tuple[str, str]], list[dict]]: A tuple containing the tidied
            sections and a list of metadata for each section.
    """
    tidier = MarkdownTidier(openai_key=openai_key, openai_model=openai_model, seed=seed)
    encoding = tiktoken.encoding_for_model(tidier.openai_model)

    tidy_sections = []
    all_metadata: list[dict] = []
    for heading, content in sections:
        print("Tidying:", heading)
        section_text = f"{heading}\n\n{content}"
        if len(encoding.encode(section_text)) > max_length:
            tidy_sections.append((heading, content))
            all_metadata.append({})
        else:
            new_section_text, metadata = tidier.process(section_text)
            new_content, similarity = preserve_content(
                heading,
                content,
                new_section_text,
                heading_similarity_threshold=heading_similarity_threshold,
                content_similarity_threshold=content_similarity_threshold,
            )
            print("\tcontent similarity:", similarity)
            tidy_sections.append((heading, new_content))
            all_metadata.append(metadata)

    return tidy_sections, all_metadata
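
Example

An end-to-end sketch. The API key and model name are placeholders (the model name must be one that tiktoken recognizes); substitute real values.

```python
sections = [
    ("# Report", "intro duc tion text with bro ken words"),
    ("## Methods", "more man gled para graphs"),
]

tidy_sections, all_metadata = tidy_markdown_sections(
    sections,
    openai_key="sk-...",          # placeholder key
    openai_model="gpt-4o-mini",   # placeholder model name
)

for (heading, content), meta in zip(tidy_sections, all_metadata):
    print(heading, meta.get("finish_reason"))
```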