Module pyparlaclarin.refine

Modify and curate Parla-Clarin documents

Expand source code
"""
Modify and curate Parla-Clarin documents
"""
import random as _random

from lxml import etree as _etree


def _iter(root, ns="{http://www.tei-c.org/ns/1.0}"):
    """
    Iterate over the direct children of every div directly under a body.

    Args:
        root: root element of the tree; it is searched with ``findall``.
        ns: namespace prefix (Clark notation) prepended to the tag names.

    Yields:
        ``(kind, elem)`` tuples, where kind is "u", "note", "pb" or "seg".
        A child with any other tag has its tag printed and is yielded as
        ``(None, elem)``, so the result can always be unpacked into two
        names.
    """
    known = {ns + name: name for name in ("u", "note", "pb", "seg")}

    for body in root.findall(".//" + ns + "body"):
        for div in body.findall(ns + "div"):
            for elem in div:
                kind = known.get(elem.tag)
                if kind is not None:
                    yield kind, elem
                elif elem.tag == "u":
                    # Utterance without a namespace: move it into ``ns``
                    # before handing it out.
                    elem.tag = ns + "u"
                    yield "u", elem
                else:
                    print(elem.tag)
                    # Yield a pair rather than a bare None; consumers unpack
                    # every item as ``tag, elem``.
                    yield None, elem


def random_classifier(paragraph):
    """
    Baseline classifier that does not look at its input.

    Args:
        paragraph: paragraph content; ignored.

    Returns:
        Either "note" or "u", picked at random.
    """
    labels = ("note", "u")
    return _random.choice(labels)


def reclassify(root, classifier, tei="{http://www.tei-c.org/ns/1.0}", exclude=[]):
    """
    Reclassify nodes in a Parla-Clarin tree.

    Every seg inside a u and every note is passed through ``classifier``.
    A seg that is not classified as "u" is moved out of its utterance,
    placed after it and retagged; later segs of the same utterance that are
    still "u" are collected in a new u element created after the moved
    node. A note classified as "u" is retagged as seg and wrapped in a
    newly created u element. Newly created u elements carry no attributes.

    Args:
        root: root of the lxml tree to be reclassified
        classifier: callable that classifies paragraphs. str->str,
            takes paragraph content as input, outputs predicted xml tag, such
            as note or u. It receives None when the element has no text.
        tei: namespace for the output xml
        exclude: tag names ("u", "note") and/or values of a note's ``type``
            attribute to leave untouched. Only read, never modified.

    Returns:
        The same root, modified in place.
    """
    # The iterator is materialised first because the tree is restructured
    # while it is being walked.
    for elem_tuple in list(_iter(root)):
        # Tolerate a bare None from the iterator instead of failing on
        # unpacking.
        if elem_tuple is None:
            continue
        tag, elem = elem_tuple

        # prev_elem is the insertion point: the last node emitted for the
        # current element. It starts out as the element itself.
        prev_elem = elem
        if tag == "u" and tag not in exclude:
            for seg in elem:
                paragraph = seg.text
                c = classifier(paragraph)
                if c != "u":
                    # Move the seg out of the utterance, right after the
                    # insertion point, and give it its new tag.
                    print("Change u to note")
                    prev_elem.addnext(seg)
                    prev_elem = seg
                    seg.tag = tei + c
                elif prev_elem != elem:
                    # An earlier seg was moved out, so this one can no longer
                    # stay in the original u: continue in a trailing u.
                    if prev_elem.tag == tei + "u":
                        prev_elem.append(seg)
                    else:
                        new_elem = _etree.Element(tei + "u")
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem
                        prev_elem.append(seg)
                else:
                    # Nothing has been moved yet; the seg stays where it is.
                    pass

        elif tag == "note" and tag not in exclude and elem.attrib.get("type") not in exclude:
            paragraph = elem.text
            c = classifier(paragraph)
            if c != tag:
                if c == "u":
                    elem.tag = tei + "seg"
                    # NOTE(review): prev_elem is the note itself at this
                    # point (now tagged seg), so this comparison is false as
                    # written and a new u is always created. Confirm whether
                    # merging into the preceding u was intended.
                    if prev_elem.tag == tei + "u":
                        print("Change note to u")
                    else:
                        # Create new u node
                        new_elem = _etree.Element(tei + c)
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem

                    prev_elem.append(elem)

                else:
                    prev_elem = elem
                    elem.tag = tei + c
            else:
                prev_elem = elem
        else:
            prev_elem = elem
    return root


def format_paragraph(paragraph, spaces=12):
    """
    Formats paragraphs to be equal in width.

    The text is split into words on any whitespace (line breaks included)
    and re-flowed: a row is closed once it has grown past 60 characters.
    Every row is indented by ``spaces`` spaces; the result starts with a
    newline and ends with a newline followed by ``spaces - 2`` spaces.

    Args:
        paragraph: paragraph content, str.
        spaces: size of indentation as number of spaces.

    Returns:
        The formatted paragraph, or None if the paragraph contains no words.
    """
    # Split on whitespace directly. Deleting newlines first would fuse the
    # last word of one line with the first word of the next.
    words = paragraph.split()
    if not words:
        return None

    rows = []
    row = ""
    for word in words:
        if len(row) > 60:
            rows.append(row.strip())
            row = word
        else:
            row += " " + word
    # The loop always leaves the unfinished last row in ``row``.
    rows.append(row.strip())

    indent = " " * spaces
    closing_indent = " " * (spaces - 2)
    return "\n" + indent + ("\n" + indent).join(rows) + "\n" + closing_indent


def format_texts(root):
    """
    Formats all text elements in a Parla-Clarin document.

    The text of notes and segs is re-flowed with ``format_paragraph``.
    Notes and segs left without text are removed, as are u elements that
    contain no text at all or no child elements. On pb elements the
    ``xml:url`` attribute is replaced by ``facs``.

    Args:
        root: Parla-Clarin document as an lxml tree root.

    Returns:
        The same root, modified in place.
    """
    xml_url = "{http://www.w3.org/XML/1998/namespace}url"

    for item in _iter(root):
        # Tolerate a bare None from the iterator instead of failing on
        # unpacking.
        if item is None:
            continue
        tag, elem = item

        # Format notes' text content
        # Remove notes with no text content
        if tag == "note":
            if isinstance(elem.text, str):
                elem.text = format_paragraph(elem.text)
            else:
                elem.text = None
            if elem.text is None:
                elem.getparent().remove(elem)

        elif tag == "u":
            if len("".join(elem.itertext())) == 0:
                # No text anywhere below the utterance.
                elem.getparent().remove(elem)
            elif len(elem) > 0:
                # Format segs' text content
                # Remove segs with no text content
                # Iterate over a snapshot, since segs are removed on the way.
                for seg in list(elem):
                    if isinstance(seg.text, str):
                        seg.text = format_paragraph(seg.text, spaces=14)
                    else:
                        seg.text = None
                    if seg.text is None:
                        seg.getparent().remove(seg)
                # Text belongs in the segs, not directly in the u.
                elem.text = None
            else:
                # Text but no child elements.
                elem.getparent().remove(elem)

        # Use facs attribute instead of xml:url
        elif tag == "pb":
            if xml_url in elem.attrib:
                url = elem.attrib[xml_url]
                del elem.attrib[xml_url]
                elem.attrib["facs"] = url

    return root

Functions

def format_paragraph(paragraph, spaces=12)

Formats paragraphs to be equal in width.

Args

paragraph
paragraph content, str.
spaces
size of indentation as number of spaces.
Expand source code
def format_paragraph(paragraph, spaces=12):
    """
    Formats paragraphs to be equal in width.

    The text is split into words on any whitespace (line breaks included)
    and re-flowed: a row is closed once it has grown past 60 characters.
    Every row is indented by ``spaces`` spaces; the result starts with a
    newline and ends with a newline followed by ``spaces - 2`` spaces.

    Args:
        paragraph: paragraph content, str.
        spaces: size of indentation as number of spaces.

    Returns:
        The formatted paragraph, or None if the paragraph contains no words.
    """
    # Split on whitespace directly. Deleting newlines first would fuse the
    # last word of one line with the first word of the next.
    words = paragraph.split()
    if not words:
        return None

    rows = []
    row = ""
    for word in words:
        if len(row) > 60:
            rows.append(row.strip())
            row = word
        else:
            row += " " + word
    # The loop always leaves the unfinished last row in ``row``.
    rows.append(row.strip())

    indent = " " * spaces
    closing_indent = " " * (spaces - 2)
    return "\n" + indent + ("\n" + indent).join(rows) + "\n" + closing_indent
def format_texts(root)

Formats all text elements in a Parla-Clarin document.

Args

root
Parla-Clarin document as an lxml tree root.
Expand source code
def format_texts(root):
    """
    Formats all text elements in a Parla-Clarin document.

    The text of notes and segs is re-flowed with ``format_paragraph``.
    Notes and segs left without text are removed, as are u elements that
    contain no text at all or no child elements. On pb elements the
    ``xml:url`` attribute is replaced by ``facs``.

    Args:
        root: Parla-Clarin document as an lxml tree root.

    Returns:
        The same root, modified in place.
    """
    xml_url = "{http://www.w3.org/XML/1998/namespace}url"

    for item in _iter(root):
        # Tolerate a bare None from the iterator instead of failing on
        # unpacking.
        if item is None:
            continue
        tag, elem = item

        # Format notes' text content
        # Remove notes with no text content
        if tag == "note":
            if isinstance(elem.text, str):
                elem.text = format_paragraph(elem.text)
            else:
                elem.text = None
            if elem.text is None:
                elem.getparent().remove(elem)

        elif tag == "u":
            if len("".join(elem.itertext())) == 0:
                # No text anywhere below the utterance.
                elem.getparent().remove(elem)
            elif len(elem) > 0:
                # Format segs' text content
                # Remove segs with no text content
                # Iterate over a snapshot, since segs are removed on the way.
                for seg in list(elem):
                    if isinstance(seg.text, str):
                        seg.text = format_paragraph(seg.text, spaces=14)
                    else:
                        seg.text = None
                    if seg.text is None:
                        seg.getparent().remove(seg)
                # Text belongs in the segs, not directly in the u.
                elem.text = None
            else:
                # Text but no child elements.
                elem.getparent().remove(elem)

        # Use facs attribute instead of xml:url
        elif tag == "pb":
            if xml_url in elem.attrib:
                url = elem.attrib[xml_url]
                del elem.attrib[xml_url]
                elem.attrib["facs"] = url

    return root
def random_classifier(paragraph)
Expand source code
def random_classifier(paragraph):
    """
    Baseline classifier that does not look at its input.

    Args:
        paragraph: paragraph content; ignored.

    Returns:
        Either "note" or "u", picked at random.
    """
    labels = ("note", "u")
    return _random.choice(labels)
def reclassify(root, classifier, tei='{http://www.tei-c.org/ns/1.0}', exclude=[])

Reclassify nodes in a Parla-Clarin tree.

Args

root
root of the lxml tree to be reclassified
classifier
Callable that classifies paragraphs (str -> str): it takes the paragraph content as input and returns the predicted XML tag, such as note or u.
tei
namespace for the output xml
exclude
Tags (such as u or note), or values of a note's type attribute, to exclude from reclassification.
Expand source code
def reclassify(root, classifier, tei="{http://www.tei-c.org/ns/1.0}", exclude=[]):
    """
    Reclassify nodes in a Parla-Clarin tree.

    Every seg inside a u and every note is passed through ``classifier``.
    A seg that is not classified as "u" is moved out of its utterance,
    placed after it and retagged; later segs of the same utterance that are
    still "u" are collected in a new u element created after the moved
    node. A note classified as "u" is retagged as seg and wrapped in a
    newly created u element. Newly created u elements carry no attributes.

    Args:
        root: root of the lxml tree to be reclassified
        classifier: callable that classifies paragraphs. str->str,
            takes paragraph content as input, outputs predicted xml tag, such
            as note or u. It receives None when the element has no text.
        tei: namespace for the output xml
        exclude: tag names ("u", "note") and/or values of a note's ``type``
            attribute to leave untouched. Only read, never modified.

    Returns:
        The same root, modified in place.
    """
    # The iterator is materialised first because the tree is restructured
    # while it is being walked.
    for elem_tuple in list(_iter(root)):
        # Tolerate a bare None from the iterator instead of failing on
        # unpacking.
        if elem_tuple is None:
            continue
        tag, elem = elem_tuple

        # prev_elem is the insertion point: the last node emitted for the
        # current element. It starts out as the element itself.
        prev_elem = elem
        if tag == "u" and tag not in exclude:
            for seg in elem:
                paragraph = seg.text
                c = classifier(paragraph)
                if c != "u":
                    # Move the seg out of the utterance, right after the
                    # insertion point, and give it its new tag.
                    print("Change u to note")
                    prev_elem.addnext(seg)
                    prev_elem = seg
                    seg.tag = tei + c
                elif prev_elem != elem:
                    # An earlier seg was moved out, so this one can no longer
                    # stay in the original u: continue in a trailing u.
                    if prev_elem.tag == tei + "u":
                        prev_elem.append(seg)
                    else:
                        new_elem = _etree.Element(tei + "u")
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem
                        prev_elem.append(seg)
                else:
                    # Nothing has been moved yet; the seg stays where it is.
                    pass

        elif tag == "note" and tag not in exclude and elem.attrib.get("type") not in exclude:
            paragraph = elem.text
            c = classifier(paragraph)
            if c != tag:
                if c == "u":
                    elem.tag = tei + "seg"
                    # NOTE(review): prev_elem is the note itself at this
                    # point (now tagged seg), so this comparison is false as
                    # written and a new u is always created. Confirm whether
                    # merging into the preceding u was intended.
                    if prev_elem.tag == tei + "u":
                        print("Change note to u")
                    else:
                        # Create new u node
                        new_elem = _etree.Element(tei + c)
                        prev_elem.addnext(new_elem)
                        prev_elem = new_elem

                    prev_elem.append(elem)

                else:
                    prev_elem = elem
                    elem.tag = tei + c
            else:
                prev_elem = elem
        else:
            prev_elem = elem
    return root