Module `clu.phontools.alignment.lbe`

Expand source code

# coding: utf-8
from enum import Enum
from clu.phontools.struct import Phrase, CoarseStress, Stress
from typing import Any, Dict, List, Text, Sequence
from pydantic import BaseModel
from pydantic.dataclasses import dataclass
import json


class LexicalBoundaryErrorType(Enum):
    """An enumeration of all of the lexical boundary (LB) error types.

    Each member's value is a short code for the error type
    ("IW", "IS", "DW", "DS" or "UNK").
    """

    # Insertion members: `calculate_lbes_from_stress` reports these when a
    # target word has more syllables than the transcript word aligned to it.
    INSERTION_WEAK = "IW"
    """insertion before weak syllable"""

    INSERTION_STRONG = "IS"
    """insertion before strong syllable"""

    # Deletion members: `calculate_lbes_from_stress` reports these when a
    # target word has fewer syllables than the transcript word aligned to it.
    DELETION_WEAK = "DW"
    """deletion before weak syllable"""

    DELETION_STRONG = "DS"
    """deletion before strong syllable"""

    # Fallback category; `calculate_lbes_from_stress` uses it for transcript
    # words that are still left over once alignment has finished.
    UNKNOWN = "UNK"
    """unknown LBE error"""


class LexicalBoundaryError(BaseModel):
    """
    A single lexical boundary error: its category together with the
    positions of the target and transcript words it involves.
    """

    # category of the boundary error
    error_type: LexicalBoundaryErrorType
    # position of the word in the target sequence
    target_index: int
    # position of the word in the transcript sequence
    transcript_index: int

    def to_tuple(self):
        """Return the error as an `(error_type, target_index, transcript_index)` tuple."""
        kind = self.error_type
        target_pos = self.target_index
        transcript_pos = self.transcript_index
        return kind, target_pos, transcript_pos

    def to_dict(self):
        """Return the error as a dictionary; the error type is given by its enum member name."""
        summary = dict(
            target_index=self.target_index,
            transcript_index=self.transcript_index,
        )
        summary["error_type"] = self.error_type.name
        return summary


class LexicalBoundaryErrorReport(BaseModel):
    """
    Bundles a target/transcript pair of phrases with the lexical boundary
    errors found between them.
    """

    # the phrase the transcript is compared against
    target: Phrase
    # the transcribed phrase
    transcript: Phrase
    # the lexical boundary errors found for this pair
    lbes: Sequence[LexicalBoundaryError]

    def to_dict(self) -> Dict[Text, Any]:
        """Return the report as a dictionary with three keys.

        :return: A dictionary with `target_stress` and `transcript_stress`
            (the `coarse_stress` of each phrase) and `lbes` (one dictionary
            per error, as produced by `LexicalBoundaryError.to_dict`).
        """
        # The model has no `target_stress` / `transcript_stress` fields, so
        # the stress patterns are read from the stored phrases instead.
        return {
            "target_stress": self.target.coarse_stress,
            "transcript_stress": self.transcript.coarse_stress,
            "lbes": [lbe.to_dict() for lbe in self.lbes],  # list of dicts
        }


def calculate_lbes_from_phrases(
    target_phrase: Phrase, transcript_phrase: Phrase
) -> LexicalBoundaryErrorReport:
    """Calculates lexical boundary errors from stress-based syllable structures via a pair of `clu.phontools.struct.Phrase` using rules described in [Jiao et al. (2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6808349/pdf/JSLHR-62-3359.pdf#page=4)

    :param target_phrase: The phrase to compare against; its `coarse_stress` supplies the target stress pattern.
    :param transcript_phrase: The transcribed phrase; its syllables are masked with "X" before alignment.
    :return: A `LexicalBoundaryErrorReport` holding both phrases and the detected errors.
    """
    target: Sequence[Text] = target_phrase.coarse_stress
    # FIXME: should these be masked?
    # The transcript side must be derived from `transcript_phrase`; building it
    # from `target_phrase` would align the target against itself.
    transcript: Sequence[Text] = transcript_phrase.mask_syllables(mask="X")
    errors: Sequence[LexicalBoundaryError] = calculate_lbes_from_stress(
        target, transcript
    )
    return LexicalBoundaryErrorReport(
        target=target_phrase, transcript=transcript_phrase, lbes=errors
    )


def calculate_lbes_from_stress(
    target: Sequence[Text], transcript: Sequence[Text]
) -> Sequence[LexicalBoundaryError]:
    """Calculates lexical boundary errors from stress-based syllable structures using rules described in [(Jiao et al., 2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6808349/pdf/JSLHR-62-3359.pdf#page=4)

    Both sequences hold one string per word with one character per syllable.
    Words are aligned left to right by syllable count: equal counts advance
    both sides, a longer target word yields an insertion error, and a shorter
    target word yields a deletion error.

    :param target: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) representing the target to which we're comparing.
    :param transcript: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) corresponding to some transcript.
    :return: The detected errors in the order they were found. Transcript words left over at the end are reported as `UNKNOWN` with `target_index=-1`.
    """
    target_remaining = [(tok, i) for (i, tok) in enumerate(target)]
    transcript_remaining = [(tok, i) for (i, tok) in enumerate(transcript)]

    errors = []
    while target_remaining and transcript_remaining:
        target_term, target_idx = target_remaining[0]
        transcript_term, transcript_idx = transcript_remaining[0]

        # base case. no error
        if len(target_term) == len(transcript_term):
            # advance both sequences
            target_remaining = target_remaining[1:]
            transcript_remaining = transcript_remaining[1:]
            continue

        if len(target_term) > len(transcript_term):
            # length mismatch (insertion case)
            # 1. drop the syllables covered by the transcript word from the target word
            target_term = target_term[len(transcript_term) :]
            target_remaining[0] = (target_term, target_idx)
            # 2. advance transcript by 1 word
            transcript_remaining = transcript_remaining[1:]
            weak_type = LexicalBoundaryErrorType.INSERTION_WEAK
            strong_type = LexicalBoundaryErrorType.INSERTION_STRONG
        else:
            # length mismatch (deletion case)
            # 1. drop the syllables covered by the target word from the transcript word
            transcript_remaining[0] = (
                transcript_term[len(target_term) :],
                transcript_idx,
            )
            # 2. advance target
            target_remaining = target_remaining[1:]
            # NOTE(review): when the target is exhausted here, the error below is
            # still categorized with the target word that was just consumed.
            # This matches the previous behavior -- confirm it is intended.
            if target_remaining:
                target_term, target_idx = target_remaining[0]
            weak_type = LexicalBoundaryErrorType.DELETION_WEAK
            strong_type = LexicalBoundaryErrorType.DELETION_STRONG

        # 3. categorize the error by the stress of the syllable that follows
        # the boundary. An empty term has no leading syllable.
        leading_stress = target_term[0] if target_term else None
        if leading_stress == CoarseStress.WEAK.value:
            error_type = weak_type
        elif leading_stress == CoarseStress.STRONG.value:
            error_type = strong_type
        else:
            # Neither weak nor strong: report it as UNKNOWN rather than
            # re-appending a stale error or failing on an unbound variable.
            error_type = LexicalBoundaryErrorType.UNKNOWN
        errors.append(
            LexicalBoundaryError(
                error_type=error_type,
                target_index=target_idx,
                transcript_index=transcript_idx,
            )
        )

    # if we still have transcript tokens remaining, append them ...
    for (_, idx) in transcript_remaining:
        errors.append(
            LexicalBoundaryError(
                error_type=LexicalBoundaryErrorType.UNKNOWN,
                target_index=-1,
                transcript_index=idx,
            )
        )
    return errors

Functions

def calculate_lbes_from_phrases(target_phrase: Phrase, transcript_phrase: Phrase) ‑> LexicalBoundaryErrorReport

Calculates lexical boundary errors from stress-based syllable structures via a pair of Phrase objects (a target and a transcript) using rules described in Jiao et al. (2019)

Expand source code

def calculate_lbes_from_phrases(
    target_phrase: Phrase, transcript_phrase: Phrase
) -> LexicalBoundaryErrorReport:
    """Calculates lexical boundary errors from stress-based syllable structures via a pair of `clu.phontools.struct.Phrase` using rules described in [Jiao et al. (2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6808349/pdf/JSLHR-62-3359.pdf#page=4)

    :param target_phrase: The phrase to compare against; its `coarse_stress` supplies the target stress pattern.
    :param transcript_phrase: The transcribed phrase; its syllables are masked with "X" before alignment.
    :return: A `LexicalBoundaryErrorReport` holding both phrases and the detected errors.
    """
    target: Sequence[Text] = target_phrase.coarse_stress
    # FIXME: should these be masked?
    # The transcript side must be derived from `transcript_phrase`; building it
    # from `target_phrase` would align the target against itself.
    transcript: Sequence[Text] = transcript_phrase.mask_syllables(mask="X")
    errors: Sequence[LexicalBoundaryError] = calculate_lbes_from_stress(
        target, transcript
    )
    return LexicalBoundaryErrorReport(
        target=target_phrase, transcript=transcript_phrase, lbes=errors
    )

def calculate_lbes_from_stress(target: Sequence[str], transcript: Sequence[str]) ‑> Sequence[LexicalBoundaryError]

Calculates lexical boundary errors from stress-based syllable structures using rules described in (Jiao et al., 2019)

:param target: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) representing the target to which we're comparing.

:param transcript: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) corresponding to some transcript.

Expand source code

def calculate_lbes_from_stress(
    target: Sequence[Text], transcript: Sequence[Text]
) -> Sequence[LexicalBoundaryError]:
    """Calculates lexical boundary errors from stress-based syllable structures using rules described in [(Jiao et al., 2019)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6808349/pdf/JSLHR-62-3359.pdf#page=4)

    Both sequences hold one string per word with one character per syllable.
    Words are aligned left to right by syllable count: equal counts advance
    both sides, a longer target word yields an insertion error, and a shorter
    target word yields a deletion error.

    :param target: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) representing the target to which we're comparing.
    :param transcript: A sequence of syllables in terms of strong (S) or weak (W) stress (ex. ["SW", "S"]) corresponding to some transcript.
    :return: The detected errors in the order they were found. Transcript words left over at the end are reported as `UNKNOWN` with `target_index=-1`.
    """
    target_remaining = [(tok, i) for (i, tok) in enumerate(target)]
    transcript_remaining = [(tok, i) for (i, tok) in enumerate(transcript)]

    errors = []
    while target_remaining and transcript_remaining:
        target_term, target_idx = target_remaining[0]
        transcript_term, transcript_idx = transcript_remaining[0]

        # base case. no error
        if len(target_term) == len(transcript_term):
            # advance both sequences
            target_remaining = target_remaining[1:]
            transcript_remaining = transcript_remaining[1:]
            continue

        if len(target_term) > len(transcript_term):
            # length mismatch (insertion case)
            # 1. drop the syllables covered by the transcript word from the target word
            target_term = target_term[len(transcript_term) :]
            target_remaining[0] = (target_term, target_idx)
            # 2. advance transcript by 1 word
            transcript_remaining = transcript_remaining[1:]
            weak_type = LexicalBoundaryErrorType.INSERTION_WEAK
            strong_type = LexicalBoundaryErrorType.INSERTION_STRONG
        else:
            # length mismatch (deletion case)
            # 1. drop the syllables covered by the target word from the transcript word
            transcript_remaining[0] = (
                transcript_term[len(target_term) :],
                transcript_idx,
            )
            # 2. advance target
            target_remaining = target_remaining[1:]
            # NOTE(review): when the target is exhausted here, the error below is
            # still categorized with the target word that was just consumed.
            # This matches the previous behavior -- confirm it is intended.
            if target_remaining:
                target_term, target_idx = target_remaining[0]
            weak_type = LexicalBoundaryErrorType.DELETION_WEAK
            strong_type = LexicalBoundaryErrorType.DELETION_STRONG

        # 3. categorize the error by the stress of the syllable that follows
        # the boundary. An empty term has no leading syllable.
        leading_stress = target_term[0] if target_term else None
        if leading_stress == CoarseStress.WEAK.value:
            error_type = weak_type
        elif leading_stress == CoarseStress.STRONG.value:
            error_type = strong_type
        else:
            # Neither weak nor strong: report it as UNKNOWN rather than
            # re-appending a stale error or failing on an unbound variable.
            error_type = LexicalBoundaryErrorType.UNKNOWN
        errors.append(
            LexicalBoundaryError(
                error_type=error_type,
                target_index=target_idx,
                transcript_index=transcript_idx,
            )
        )

    # if we still have transcript tokens remaining, append them ...
    for (_, idx) in transcript_remaining:
        errors.append(
            LexicalBoundaryError(
                error_type=LexicalBoundaryErrorType.UNKNOWN,
                target_index=-1,
                transcript_index=idx,
            )
        )
    return errors

Classes

class LexicalBoundaryError (**data: Any)

Encodes a single Lexical Boundary error

Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.

Expand source code

class LexicalBoundaryError(BaseModel):
    """
    A single lexical boundary error: its category together with the
    positions of the target and transcript words it involves.
    """

    # category of the boundary error
    error_type: LexicalBoundaryErrorType
    # position of the word in the target sequence
    target_index: int
    # position of the word in the transcript sequence
    transcript_index: int

    def to_tuple(self):
        """Return the error as an `(error_type, target_index, transcript_index)` tuple."""
        kind = self.error_type
        target_pos = self.target_index
        transcript_pos = self.transcript_index
        return kind, target_pos, transcript_pos

    def to_dict(self):
        """Return the error as a dictionary; the error type is given by its enum member name."""
        summary = dict(
            target_index=self.target_index,
            transcript_index=self.transcript_index,
        )
        summary["error_type"] = self.error_type.name
        return summary

Ancestors

pydantic.main.BaseModel
pydantic.utils.Representation

Class variables

var error_type : LexicalBoundaryErrorType
var target_index : int
var transcript_index : int

Methods

def to_dict(self)

Expand source code

def to_dict(self):
    """Return the error as a dictionary; the error type is given by its enum member name."""
    summary = dict(
        target_index=self.target_index,
        transcript_index=self.transcript_index,
    )
    summary["error_type"] = self.error_type.name
    return summary

def to_tuple(self)

Expand source code

def to_tuple(self):
    """Return the error as an `(error_type, target_index, transcript_index)` tuple."""
    kind = self.error_type
    target_pos = self.target_index
    transcript_pos = self.transcript_index
    return kind, target_pos, transcript_pos

class LexicalBoundaryErrorReport (**data: Any)

Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.

Expand source code

class LexicalBoundaryErrorReport(BaseModel):
    """
    Bundles a target/transcript pair of phrases with the lexical boundary
    errors found between them.
    """

    # the phrase the transcript is compared against
    target: Phrase
    # the transcribed phrase
    transcript: Phrase
    # the lexical boundary errors found for this pair
    lbes: Sequence[LexicalBoundaryError]

    def to_dict(self) -> Dict[Text, Any]:
        """Return the report as a dictionary with three keys.

        :return: A dictionary with `target_stress` and `transcript_stress`
            (the `coarse_stress` of each phrase) and `lbes` (one dictionary
            per error, as produced by `LexicalBoundaryError.to_dict`).
        """
        # The model has no `target_stress` / `transcript_stress` fields, so
        # the stress patterns are read from the stored phrases instead.
        return {
            "target_stress": self.target.coarse_stress,
            "transcript_stress": self.transcript.coarse_stress,
            "lbes": [lbe.to_dict() for lbe in self.lbes],  # list of dicts
        }

Ancestors

pydantic.main.BaseModel
pydantic.utils.Representation

Class variables

var lbes : Sequence[LexicalBoundaryError]
var target : Phrase
var transcript : Phrase

Methods

def to_dict(self) ‑> Dict[str, Any]

Expand source code

def to_dict(self) -> Dict[Text, Any]:
    """Return the report as a dictionary with three keys.

    :return: A dictionary with `target_stress` and `transcript_stress`
        (the `coarse_stress` of each phrase) and `lbes` (one dictionary
        per error, as produced by each error's `to_dict`).
    """
    # The report has no `target_stress` / `transcript_stress` attributes, so
    # the stress patterns are read from the stored phrases instead.
    return {
        "target_stress": self.target.coarse_stress,
        "transcript_stress": self.transcript.coarse_stress,
        "lbes": [lbe.to_dict() for lbe in self.lbes],  # list of dicts
    }

class LexicalBoundaryErrorType (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration of all of the lexical boundary (LB) error types.

Expand source code

class LexicalBoundaryErrorType(Enum):
    """An enumeration of all of the lexical boundary (LB) error types.

    Each member's value is a short code for the error type
    ("IW", "IS", "DW", "DS" or "UNK").
    """

    # Insertion members: `calculate_lbes_from_stress` reports these when a
    # target word has more syllables than the transcript word aligned to it.
    INSERTION_WEAK = "IW"
    """insertion before weak syllable"""

    INSERTION_STRONG = "IS"
    """insertion before strong syllable"""

    # Deletion members: `calculate_lbes_from_stress` reports these when a
    # target word has fewer syllables than the transcript word aligned to it.
    DELETION_WEAK = "DW"
    """deletion before weak syllable"""

    DELETION_STRONG = "DS"
    """deletion before strong syllable"""

    # Fallback category; `calculate_lbes_from_stress` uses it for transcript
    # words that are still left over once alignment has finished.
    UNKNOWN = "UNK"
    """unknown LBE error"""

Ancestors

enum.Enum

Class variables

var DELETION_STRONG: deletion before strong syllable
var DELETION_WEAK: deletion before weak syllable
var INSERTION_STRONG: insertion before strong syllable
var INSERTION_WEAK: insertion before weak syllable
var UNKNOWN: unknown LBE error