Module `pyparlaclarin.refine`

Modify and curate Parla-Clarin documents

Expand source code

"""
Modify and curate Parla-Clarin documents
"""
import random as _random

from lxml import etree as _etree


def _iter(root, ns="{http://www.tei-c.org/ns/1.0}"):
    """
    Iterate over the direct children of every div found in every body.

    Args:
        root: root of the tree to traverse.
        ns: namespace prefix in Clark notation, prepended to tag names.

    Yields:
        (kind, elem) tuples. kind is "u", "note", "pb" or "seg" for
        recognised elements and None for everything else, so callers can
        always unpack the yielded value.
    """
    known = {ns + kind: kind for kind in ("u", "note", "pb", "seg")}

    for body in root.findall(".//" + ns + "body"):
        for div in body.findall(ns + "div"):
            for elem in div:
                kind = known.get(elem.tag)
                if kind is not None:
                    yield kind, elem
                elif elem.tag == "u":
                    # A u without namespace: move it into the namespace
                    elem.tag = ns + "u"
                    yield "u", elem
                else:
                    # Unrecognised element: report its tag, but still yield a
                    # pair. A bare None here made every consumer fail on
                    # `tag, elem = ...`.
                    print(elem.tag)
                    yield None, elem


def random_classifier(paragraph):
    """
    Classify a paragraph by picking "note" or "u" at random.

    Args:
        paragraph: paragraph content; it is not inspected.
    """
    labels = ["note", "u"]
    picked = _random.choice(labels)
    return picked


def reclassify(root, classifier, tei="{http://www.tei-c.org/ns/1.0}", exclude=[]):
    """
    Reclassify nodes in a Parla-Clarin tree.

    The text of every child of a u, and of every note, is passed to the
    classifier. A child of a u that is not classified as u is moved out to
    become a sibling following the u and is retagged with the predicted tag.
    A note classified as u is retagged as seg and wrapped in a newly created
    u. The tree is modified in place.

    Args:
        root: root of the lxml tree to be reclassified
        classifier: lambda function that classifies paragraphs. str->str,
            takes paragraph content as input, outputs predicted xml tag, such
            as note or u.
        tei: namespace for the output xml
        exclude: exclude certain tags or types of element from reclassification.
            Only read here, never mutated.

    Returns:
        root, after modification.
    """
    # Snapshot the traversal up front: nodes are moved and inserted below
    for item in list(_iter(root)):
        # _iter may yield a bare None for elements it does not recognise.
        # There is nothing to reclassify, and unpacking it would raise.
        if item is None:
            continue
        tag, elem = item

        # prev_elem is the node after which split-off content is inserted.
        # It restarts at the current element on every iteration.
        prev_elem = elem
        if tag == "u" and tag not in exclude:
            # Iterate over a copy, since children are moved out of elem
            for seg in list(elem):
                c = classifier(seg.text)
                if c != "u":
                    print("Change u to note")
                    prev_elem.addnext(seg)
                    prev_elem = seg
                    seg.tag = tei + c
                elif prev_elem != elem:
                    # Something was already split off, so this seg can no
                    # longer stay in the original u
                    if prev_elem.tag == tei + "u":
                        prev_elem.append(seg)
                    else:
                        new_elem = _etree.Element(tei + "u")
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem
                        prev_elem.append(seg)

        elif tag == "note" and tag not in exclude and elem.attrib.get("type") not in exclude:
            c = classifier(elem.text)
            if c != tag:
                if c == "u":
                    elem.tag = tei + "seg"
                    # NOTE(review): prev_elem is elem itself at this point and
                    # was just retagged as seg, so this condition is never
                    # true and a new u is always created. Presumably the
                    # intent was to merge into a preceding u -- confirm
                    # before changing.
                    if prev_elem.tag == tei + "u":
                        print("Change note to u")
                    else:
                        # Create new u node
                        new_elem = _etree.Element(tei + c)
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem

                    prev_elem.append(elem)

                else:
                    elem.tag = tei + c
    return root


def format_paragraph(paragraph, spaces=12):
    """
    Formats paragraphs to be equal in width.

    Words are packed into rows; a new row is started once the current row
    is longer than 60 characters. Every row is indented by `spaces` spaces
    and the result ends with a newline followed by `spaces - 2` spaces.

    Args:
        paragraph: paragraph content, str.
        spaces: size of indentation as number of spaces.

    Returns:
        The formatted paragraph, or None if the paragraph contains no words.
    """
    # split() treats newlines as whitespace. Deleting them first would glue
    # together the last word of one line and the first word of the next.
    words = paragraph.split()
    if not words:
        return None

    indent = " " * spaces
    rows = []
    row = ""
    for word in words:
        if len(row) > 60:
            rows.append(row.strip())
            row = word
        else:
            row += " " + word
    rows.append(row.strip())

    return "\n" + indent + ("\n" + indent).join(rows) + "\n" + " " * (spaces - 2)


def format_texts(root):
    """
    Formats all text elements in a Parla-Clarin document.

    The text of notes, and of the children of u elements, is re-wrapped with
    format_paragraph. Notes and u children without text are removed, as are
    u elements that have no text content or no children. On pb elements the
    xml:url attribute is replaced by facs. The tree is modified in place.

    Args:
        root: Parla-Clarin document as an lxml tree root.

    Returns:
        root, after modification.
    """
    xml_url = "{http://www.w3.org/XML/1998/namespace}url"

    # Snapshot the traversal up front: elements are removed from the tree below
    for item in list(_iter(root)):
        # _iter may yield a bare None for elements it does not recognise.
        # There is nothing to format, and unpacking it would raise.
        if item is None:
            continue
        tag, elem = item

        # Format notes' text content
        # Remove notes with no text content
        if tag == "note":
            text = elem.text
            elem.text = format_paragraph(text) if isinstance(text, str) else None
            if elem.text is None:
                elem.getparent().remove(elem)

        # Remove u's with no children
        elif tag == "u":
            if not "".join(elem.itertext()):
                elem.getparent().remove(elem)
            elif len(elem) > 0:
                # Format segs' text content
                # Remove segs with no text content
                for seg in list(elem):
                    text = seg.text
                    if isinstance(text, str):
                        seg.text = format_paragraph(text, spaces=14)
                    else:
                        seg.text = None
                    if seg.text is None:
                        elem.remove(seg)
                elem.text = None
            else:
                elem.getparent().remove(elem)

        # Use facs attribute instead of xml:url
        elif tag == "pb":
            if xml_url in elem.attrib:
                url = elem.attrib[xml_url]
                del elem.attrib[xml_url]
                elem.attrib["facs"] = url

    return root

Functions

def format_paragraph(paragraph, spaces=12)

Formats paragraphs to be equal in width.

Args

paragraph: paragraph content, str.
spaces: size of indentation as number of spaces.

Expand source code

def format_paragraph(paragraph, spaces=12):
    """
    Formats paragraphs to be equal in width.

    Words are packed into rows; a new row is started once the current row
    is longer than 60 characters. Every row is indented by `spaces` spaces
    and the result ends with a newline followed by `spaces - 2` spaces.

    Args:
        paragraph: paragraph content, str.
        spaces: size of indentation as number of spaces.

    Returns:
        The formatted paragraph, or None if the paragraph contains no words.
    """
    # split() treats newlines as whitespace. Deleting them first would glue
    # together the last word of one line and the first word of the next.
    words = paragraph.split()
    if not words:
        return None

    indent = " " * spaces
    rows = []
    row = ""
    for word in words:
        if len(row) > 60:
            rows.append(row.strip())
            row = word
        else:
            row += " " + word
    rows.append(row.strip())

    return "\n" + indent + ("\n" + indent).join(rows) + "\n" + " " * (spaces - 2)

def format_texts(root)

Formats all text elements in a Parla-Clarin document.

Args

root: Parla-Clarin document as an lxml tree root.

Expand source code

def format_texts(root):
    """
    Formats all text elements in a Parla-Clarin document.

    The text of notes, and of the children of u elements, is re-wrapped with
    format_paragraph. Notes and u children without text are removed, as are
    u elements that have no text content or no children. On pb elements the
    xml:url attribute is replaced by facs. The tree is modified in place.

    Args:
        root: Parla-Clarin document as an lxml tree root.

    Returns:
        root, after modification.
    """
    xml_url = "{http://www.w3.org/XML/1998/namespace}url"

    # Snapshot the traversal up front: elements are removed from the tree below
    for item in list(_iter(root)):
        # _iter may yield a bare None for elements it does not recognise.
        # There is nothing to format, and unpacking it would raise.
        if item is None:
            continue
        tag, elem = item

        # Format notes' text content
        # Remove notes with no text content
        if tag == "note":
            text = elem.text
            elem.text = format_paragraph(text) if isinstance(text, str) else None
            if elem.text is None:
                elem.getparent().remove(elem)

        # Remove u's with no children
        elif tag == "u":
            if not "".join(elem.itertext()):
                elem.getparent().remove(elem)
            elif len(elem) > 0:
                # Format segs' text content
                # Remove segs with no text content
                for seg in list(elem):
                    text = seg.text
                    if isinstance(text, str):
                        seg.text = format_paragraph(text, spaces=14)
                    else:
                        seg.text = None
                    if seg.text is None:
                        elem.remove(seg)
                elem.text = None
            else:
                elem.getparent().remove(elem)

        # Use facs attribute instead of xml:url
        elif tag == "pb":
            if xml_url in elem.attrib:
                url = elem.attrib[xml_url]
                del elem.attrib[xml_url]
                elem.attrib["facs"] = url

    return root

def random_classifier(paragraph)

Expand source code

def random_classifier(paragraph):
    """
    Classify a paragraph by picking "note" or "u" at random.

    Args:
        paragraph: paragraph content; it is not inspected.
    """
    labels = ["note", "u"]
    picked = _random.choice(labels)
    return picked

def reclassify(root, classifier, tei='{http://www.tei-c.org/ns/1.0}', exclude=[])

Reclassify nodes in a Parla-Clarin tree.

Args

root: root of the lxml tree to be reclassified
classifier: lambda function that classifies paragraphs. str->str, takes paragraph content as input, outputs predicted xml tag, such as note or u.
tei: namespace for the output xml
exclude: exclude certain tags or types of element from reclassification

Expand source code

def reclassify(root, classifier, tei="{http://www.tei-c.org/ns/1.0}", exclude=[]):
    """
    Reclassify nodes in a Parla-Clarin tree.

    The text of every child of a u, and of every note, is passed to the
    classifier. A child of a u that is not classified as u is moved out to
    become a sibling following the u and is retagged with the predicted tag.
    A note classified as u is retagged as seg and wrapped in a newly created
    u. The tree is modified in place.

    Args:
        root: root of the lxml tree to be reclassified
        classifier: lambda function that classifies paragraphs. str->str,
            takes paragraph content as input, outputs predicted xml tag, such
            as note or u.
        tei: namespace for the output xml
        exclude: exclude certain tags or types of element from reclassification.
            Only read here, never mutated.

    Returns:
        root, after modification.
    """
    # Snapshot the traversal up front: nodes are moved and inserted below
    for item in list(_iter(root)):
        # _iter may yield a bare None for elements it does not recognise.
        # There is nothing to reclassify, and unpacking it would raise.
        if item is None:
            continue
        tag, elem = item

        # prev_elem is the node after which split-off content is inserted.
        # It restarts at the current element on every iteration.
        prev_elem = elem
        if tag == "u" and tag not in exclude:
            # Iterate over a copy, since children are moved out of elem
            for seg in list(elem):
                c = classifier(seg.text)
                if c != "u":
                    print("Change u to note")
                    prev_elem.addnext(seg)
                    prev_elem = seg
                    seg.tag = tei + c
                elif prev_elem != elem:
                    # Something was already split off, so this seg can no
                    # longer stay in the original u
                    if prev_elem.tag == tei + "u":
                        prev_elem.append(seg)
                    else:
                        new_elem = _etree.Element(tei + "u")
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem
                        prev_elem.append(seg)

        elif tag == "note" and tag not in exclude and elem.attrib.get("type") not in exclude:
            c = classifier(elem.text)
            if c != tag:
                if c == "u":
                    elem.tag = tei + "seg"
                    # NOTE(review): prev_elem is elem itself at this point and
                    # was just retagged as seg, so this condition is never
                    # true and a new u is always created. Presumably the
                    # intent was to merge into a preceding u -- confirm
                    # before changing.
                    if prev_elem.tag == tei + "u":
                        print("Change note to u")
                    else:
                        # Create new u node
                        new_elem = _etree.Element(tei + c)
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem

                    prev_elem.append(elem)

                else:
                    elem.tag = tei + c
    return root