Module clu.phontools.pronouncing

Expand source code
from abc import ABC, abstractmethod
from typing import Dict, Text, Tuple, List, Optional, Sequence, Callable
from clu.phontools.struct import Pronunciation, SimpleWord, Stress
import os


ConverterFunc = Callable[[Text], Text]


def identity(x: Text) -> Text:
    """Returns `x` unchanged; the default (no-op) symbol converter"""
    return x


class ConverterUtils:

    """Converter utilities to map between phonological symbol sets"""

    # FIXME: add more symbols
    # TODO: test cases
    # NOTE(review): a few vowel targets look non-standard for IPA ("IH" -> "i",
    # "IY" -> "I", "UW" -> "U"); they are kept as-is because existing callers
    # may rely on them -- confirm against the intended symbol inventory.
    arpabet_to_ipa_dict: Dict[Text, Text] = {
        "AA": "ɒ",
        "AE": "æ",
        # FIXME: this should translate to "ʌ" OR "ə" depending on the stress assignment
        "AH": "ʌ",
        "AO": "ɔ",
        "AW": "aʊ",
        "AY": "ai",
        "B": "b",
        "CH": "tʃ",
        "D": "d",
        "DH": "ð",
        "EH": "ɛ",
        "ER": "ə",
        "EY": "ei",
        "F": "f",
        "G": "g",
        "HH": "h",
        "IH": "i",
        "IY": "I",
        "JH": "dʒ",
        "K": "k",
        "L": "l",
        "M": "m",
        "N": "n",
        "NG": "ŋ",
        "OW": "oʊ",
        "OY": "ɔi",
        "P": "p",
        "R": "ɹ",
        "S": "s",
        "SH": "ʃ",
        "T": "t",
        "TH": "θ",
        "UH": "ʊ",
        "UW": "U",
        "V": "v",
        "W": "w",
        "Y": "j",
        "Z": "z",
        "ZH": "ʒ",
    }
    """Dictionary mapping arpabet symbols to IPA"""

    # No reverse (IPA -> Arpabet) table is cached: `ipa_to_arpabet` scans
    # `arpabet_to_ipa_dict` directly, so edits to that dict are always honoured.

    @staticmethod
    def arpabet_to_ipa(symbol: Text) -> Text:
        """Converts an Arpabet symbol to IPA

        A trailing stress marker (one of the `Stress.PRIMARY`,
        `Stress.SECONDARY` or `Stress.NO_STRESS` values) is dropped before the
        lookup. A symbol without an entry in `arpabet_to_ipa_dict` is returned
        without its stress marker but otherwise unchanged, and an empty symbol
        is returned as-is.

        Example:
        from clu.phontools.pronouncing import ConverterUtils
        bell_arpa = ('B', 'EH1', 'L')
        bell_ipa = tuple(ConverterUtils.arpabet_to_ipa(symb) for symb in bell_arpa)
        # should produce ('b', 'ɛ', 'l')
        """
        # An empty symbol has no final character to inspect (symbol[-1] would raise).
        if not symbol:
            return symbol
        stress: Optional[Text] = (
            None
            if symbol[-1]
            not in {
                Stress.PRIMARY.value,
                Stress.SECONDARY.value,
                Stress.NO_STRESS.value,
            }
            else symbol[-1]
        )
        base_form: Text = symbol if not stress else symbol[:-1]
        return ConverterUtils.arpabet_to_ipa_dict.get(base_form, base_form)

    @staticmethod
    def ipa_to_arpabet(symbol: Text) -> Text:
        """Converts an IPA symbol to Arpabet

        The result carries no stress marker. A symbol that is not a value of
        `arpabet_to_ipa_dict` is returned unchanged.

        Example:
        from clu.phontools.pronouncing import ConverterUtils
        bell_ipa = ('b', 'ɛ', 'l')
        bell_arpa = tuple(ConverterUtils.ipa_to_arpabet(symb) for symb in bell_ipa)
        # should produce ('B', 'EH', 'L')
        """
        for (arpa, ipa) in ConverterUtils.arpabet_to_ipa_dict.items():
            if ipa == symbol:
                return arpa
        # in case of failure, parrot back symbol
        return symbol


class PronouncingDict(dict, ABC):
    """
    Maps lexical entries (words) -> lists of pronunciations

    Keys are normalized with `_preprocess_key` (lower-cased) before they are
    stored or looked up. All entries live in the private `_dict` attribute;
    the storage inherited from `dict` is left empty, so only the methods
    overridden here reflect the contents.
    """

    def __init__(
        self, pairs: Optional[List[Tuple[SimpleWord, Pronunciation]]] = None
    ):
        """Builds the dictionary from (word, pronunciation) pairs

        Pairs sharing the same normalized word are grouped, in order, into
        one list of pronunciations. `None` (the default) means no pairs.
        """
        super().__init__()
        self._dict: Dict[SimpleWord, List[Pronunciation]] = self._generate_dict(
            pairs if pairs is not None else []
        )

    @abstractmethod
    def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
        """Returns the stress assignment for each phone in the pronunciation

        Subclasses of `clu.phontools.pronouncing.PronouncingDict` should implement `clu.phontools.pronouncing.PronouncingDict.stress_for`
        """
        pass

    def _preprocess_key(self, key: SimpleWord) -> SimpleWord:
        """Normalizes a word before it is used as a key (lower-cases it)"""
        return key.lower()

    def keys(self):
        return self._dict.keys()

    def values(self):
        return self._dict.values()

    def items(self):
        return self._dict.items()

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def __contains__(self, key: object) -> bool:
        # The inherited dict storage is never populated, so the default
        # membership test would always answer False.
        if not isinstance(key, str):
            return False
        return self._preprocess_key(key) in self._dict

    def get(self, key: Text) -> List[Pronunciation]:
        """Returns the pronunciations for the word, or an empty list if it is unknown"""
        return self._dict.get(self._preprocess_key(key), [])

    def add(self, key: SimpleWord, value: Pronunciation) -> None:
        """Adds a pronunciation for the given word"""
        clean_key: SimpleWord = self._preprocess_key(key)
        old: List[Pronunciation] = self._dict.get(clean_key, [])
        self._dict[clean_key] = old + [value]

    def __getitem__(self, key: SimpleWord) -> List[Pronunciation]:
        # Keys are always stored normalized, so look them up the same way
        # (consistent with `get` and `__setitem__`). Raises KeyError if absent.
        return self._dict[self._preprocess_key(key)]

    def __setitem__(self, key: SimpleWord, value: List[Pronunciation]) -> None:
        self._dict[self._preprocess_key(key)] = value

    def _generate_dict(
        self, pairs: List[Tuple[SimpleWord, Pronunciation]]
    ) -> Dict[SimpleWord, List[Pronunciation]]:
        """Groups (word, pronunciation) pairs by normalized word, keeping input order"""
        pronunciation_dict: Dict[SimpleWord, List[Pronunciation]] = {}
        for (k, v) in pairs:
            pronunciation_dict.setdefault(self._preprocess_key(k), []).append(v)
        return pronunciation_dict


class CMUPronouncingDict(PronouncingDict):
    """Pronouncing dictionary whose phones carry CMUdict-style stress digits ("0", "1", "2")"""

    def __init__(
        self, pairs: Optional[List[Tuple[SimpleWord, Pronunciation]]] = None
    ):
        # Always hand the parent a list so this works with either parent default.
        super().__init__(pairs if pairs is not None else [])

    def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
        """Returns the stress assignment for each phone in the pronunciation

        The first "0", "1" or "2" found in a phone selects `Stress.NO_STRESS`,
        `Stress.PRIMARY` or `Stress.SECONDARY` respectively; a phone without
        such a digit is assigned `Stress.NON_VOWEL`.
        """
        digit_to_stress = {
            "0": Stress.NO_STRESS,
            "1": Stress.PRIMARY,
            "2": Stress.SECONDARY,
        }
        stress_pattern = [
            next(
                (digit_to_stress[ch] for ch in phone if ch in digit_to_stress),
                Stress.NON_VOWEL,
            )
            for phone in pronunciation
        ]
        assert len(stress_pattern) == len(
            pronunciation
        ), "each phone must have a stress assignment (non-vowels should be assigned Stress.NON_VOWEL)"
        return stress_pattern

    @staticmethod
    def from_cmu_dict(
        filepath: Optional[str] = None, converter: ConverterFunc = identity
    ) -> "CMUPronouncingDict":
        """Builds a `CMUPronouncingDict` from a CMUdict-formatted file

        Each non-comment row is expected to look like `WORD  PH1 PH2 ...`
        (headword, two spaces, then whitespace-separated phones). Rows
        starting with ";;", blank rows and rows without any phone are
        skipped. Alternate pronunciations written as `WORD(1)` are folded
        into `WORD`, and a headword starting with a non-alphanumeric
        character is reduced to that character ("{LEFT-BRACE" -> "{").

        Args:
            filepath: file to read; defaults to `resources/cmudict` next to
                this module.
            converter: applied to every phone (identity by default).

        Returns:
            A dictionary holding every (word, pronunciation) pair read.
        """
        cmudict_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "resources", "cmudict"
        )
        filepath = filepath or cmudict_file
        pairs = []
        with open(filepath, "r", encoding="ISO-8859-1") as infile:
            for row in infile:
                # ignore comments
                if row.startswith(";;"):
                    continue
                # headword and phones are separated by the first double space
                head, _, phones = row.strip().partition("  ")
                pronunciation = phones.split()
                # a blank row has no headword to index; a row without phones
                # carries no pronunciation
                if not head or not pronunciation:
                    continue
                key = head.lower()
                # discard (x) for any entries with multiple pronunciations
                # permit(1) -> permit
                # permit -> permit
                if key.endswith(")"):
                    stem, paren, _ = key.rpartition("(")
                    # only strip when something is left in front of the "("
                    if paren and stem:
                        key = stem
                # "{LEFT-BRACE" -> "{"
                # NOTE(review): this also reduces apostrophe-initial words
                # (e.g. "'bout") to "'" -- confirm whether that is intended.
                key = key if key[0].isalnum() else key[0]
                # ARPAbet pronunciation.
                # optionally convert to provided format
                value = tuple(converter(phon) for phon in pronunciation)
                pairs.append((key, value))
        return CMUPronouncingDict(pairs)

Functions

def identity(x) ‑> Callable[[str], str]
Expand source code
def identity(x: Text) -> Text:
    """Returns `x` unchanged; the default (no-op) symbol converter"""
    return x

Classes

class CMUPronouncingDict (pairs: List[Tuple[str, Tuple[str, ...]]] = [])

Maps lexical entries (words) -> lists of pronunciations

Expand source code
class CMUPronouncingDict(PronouncingDict):
    """Pronouncing dictionary whose phones carry CMUdict-style stress digits ("0", "1", "2")"""

    def __init__(
        self, pairs: Optional[List[Tuple[SimpleWord, Pronunciation]]] = None
    ):
        # Always hand the parent a list so this works with either parent default.
        super().__init__(pairs if pairs is not None else [])

    def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
        """Returns the stress assignment for each phone in the pronunciation

        The first "0", "1" or "2" found in a phone selects `Stress.NO_STRESS`,
        `Stress.PRIMARY` or `Stress.SECONDARY` respectively; a phone without
        such a digit is assigned `Stress.NON_VOWEL`.
        """
        digit_to_stress = {
            "0": Stress.NO_STRESS,
            "1": Stress.PRIMARY,
            "2": Stress.SECONDARY,
        }
        stress_pattern = [
            next(
                (digit_to_stress[ch] for ch in phone if ch in digit_to_stress),
                Stress.NON_VOWEL,
            )
            for phone in pronunciation
        ]
        assert len(stress_pattern) == len(
            pronunciation
        ), "each phone must have a stress assignment (non-vowels should be assigned Stress.NON_VOWEL)"
        return stress_pattern

    @staticmethod
    def from_cmu_dict(
        filepath: Optional[str] = None, converter: ConverterFunc = identity
    ) -> "CMUPronouncingDict":
        """Builds a `CMUPronouncingDict` from a CMUdict-formatted file

        Each non-comment row is expected to look like `WORD  PH1 PH2 ...`
        (headword, two spaces, then whitespace-separated phones). Rows
        starting with ";;", blank rows and rows without any phone are
        skipped. Alternate pronunciations written as `WORD(1)` are folded
        into `WORD`, and a headword starting with a non-alphanumeric
        character is reduced to that character ("{LEFT-BRACE" -> "{").

        Args:
            filepath: file to read; defaults to `resources/cmudict` next to
                this module.
            converter: applied to every phone (identity by default).

        Returns:
            A dictionary holding every (word, pronunciation) pair read.
        """
        cmudict_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "resources", "cmudict"
        )
        filepath = filepath or cmudict_file
        pairs = []
        with open(filepath, "r", encoding="ISO-8859-1") as infile:
            for row in infile:
                # ignore comments
                if row.startswith(";;"):
                    continue
                # headword and phones are separated by the first double space
                head, _, phones = row.strip().partition("  ")
                pronunciation = phones.split()
                # a blank row has no headword to index; a row without phones
                # carries no pronunciation
                if not head or not pronunciation:
                    continue
                key = head.lower()
                # discard (x) for any entries with multiple pronunciations
                # permit(1) -> permit
                # permit -> permit
                if key.endswith(")"):
                    stem, paren, _ = key.rpartition("(")
                    # only strip when something is left in front of the "("
                    if paren and stem:
                        key = stem
                # "{LEFT-BRACE" -> "{"
                # NOTE(review): this also reduces apostrophe-initial words
                # (e.g. "'bout") to "'" -- confirm whether that is intended.
                key = key if key[0].isalnum() else key[0]
                # ARPAbet pronunciation.
                # optionally convert to provided format
                value = tuple(converter(phon) for phon in pronunciation)
                pairs.append((key, value))
        return CMUPronouncingDict(pairs)

Ancestors

Static methods

def from_cmu_dict(filepath: Union[str, NoneType] = None, converter: Callable[[str], str] = <function <lambda>>) ‑> CMUPronouncingDict
Expand source code
@staticmethod
def from_cmu_dict(
    filepath: Optional[str] = None, converter: ConverterFunc = identity
) -> "CMUPronouncingDict":
    """Builds a `CMUPronouncingDict` from a CMUdict-formatted file

    Each non-comment row is expected to look like `WORD  PH1 PH2 ...`
    (headword, two spaces, then whitespace-separated phones). Rows starting
    with ";;", blank rows and rows without any phone are skipped. Alternate
    pronunciations written as `WORD(1)` are folded into `WORD`, and a
    headword starting with a non-alphanumeric character is reduced to that
    character ("{LEFT-BRACE" -> "{").

    Args:
        filepath: file to read; defaults to `resources/cmudict` next to this
            module.
        converter: applied to every phone (identity by default).

    Returns:
        A dictionary holding every (word, pronunciation) pair read.
    """
    cmudict_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "resources", "cmudict"
    )
    filepath = filepath or cmudict_file
    pairs = []
    with open(filepath, "r", encoding="ISO-8859-1") as infile:
        for row in infile:
            # ignore comments
            if row.startswith(";;"):
                continue
            # headword and phones are separated by the first double space
            head, _, phones = row.strip().partition("  ")
            pronunciation = phones.split()
            # a blank row has no headword to index; a row without phones
            # carries no pronunciation
            if not head or not pronunciation:
                continue
            key = head.lower()
            # discard (x) for any entries with multiple pronunciations
            # permit(1) -> permit
            # permit -> permit
            if key.endswith(")"):
                stem, paren, _ = key.rpartition("(")
                # only strip when something is left in front of the "("
                if paren and stem:
                    key = stem
            # "{LEFT-BRACE" -> "{"
            # NOTE(review): this also reduces apostrophe-initial words
            # (e.g. "'bout") to "'" -- confirm whether that is intended.
            key = key if key[0].isalnum() else key[0]
            # ARPAbet pronunciation.
            # optionally convert to provided format
            value = tuple(converter(phon) for phon in pronunciation)
            pairs.append((key, value))
    return CMUPronouncingDict(pairs)

Methods

def stress_for(self, pronunciation: Tuple[str, ...]) ‑> Sequence[Stress]

Returns the stress assignment for each phone in the pronunciation

Expand source code
def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
    """Returns the stress assignment for each phone in the pronunciation

    The first "0", "1" or "2" found in a phone selects `Stress.NO_STRESS`,
    `Stress.PRIMARY` or `Stress.SECONDARY` respectively; a phone without such
    a digit is assigned `Stress.NON_VOWEL`.
    """
    digit_to_stress = {
        "0": Stress.NO_STRESS,
        "1": Stress.PRIMARY,
        "2": Stress.SECONDARY,
    }
    stress_pattern = [
        # first stress digit in the phone wins; no digit means non-vowel
        next(
            (digit_to_stress[ch] for ch in phone if ch in digit_to_stress),
            Stress.NON_VOWEL,
        )
        for phone in pronunciation
    ]
    assert len(stress_pattern) == len(
        pronunciation
    ), "each phone must have a stress assignment (non-vowels should be assigned Stress.NON_VOWEL)"
    return stress_pattern

Inherited members

class ConverterUtils

Converter utilities to map between phonological symbol sets

Expand source code
class ConverterUtils:

    """Converter utilities to map between phonological symbol sets"""

    # FIXME: add more symbols
    # TODO: test cases
    # NOTE(review): a few vowel targets look non-standard for IPA ("IH" -> "i",
    # "IY" -> "I", "UW" -> "U"); they are kept as-is because existing callers
    # may rely on them -- confirm against the intended symbol inventory.
    arpabet_to_ipa_dict: Dict[Text, Text] = {
        "AA": "ɒ",
        "AE": "æ",
        # FIXME: this should translate to "ʌ" OR "ə" depending on the stress assignment
        "AH": "ʌ",
        "AO": "ɔ",
        "AW": "aʊ",
        "AY": "ai",
        "B": "b",
        "CH": "tʃ",
        "D": "d",
        "DH": "ð",
        "EH": "ɛ",
        "ER": "ə",
        "EY": "ei",
        "F": "f",
        "G": "g",
        "HH": "h",
        "IH": "i",
        "IY": "I",
        "JH": "dʒ",
        "K": "k",
        "L": "l",
        "M": "m",
        "N": "n",
        "NG": "ŋ",
        "OW": "oʊ",
        "OY": "ɔi",
        "P": "p",
        "R": "ɹ",
        "S": "s",
        "SH": "ʃ",
        "T": "t",
        "TH": "θ",
        "UH": "ʊ",
        "UW": "U",
        "V": "v",
        "W": "w",
        "Y": "j",
        "Z": "z",
        "ZH": "ʒ",
    }
    """Dictionary mapping arpabet symbols to IPA"""

    # No reverse (IPA -> Arpabet) table is cached: `ipa_to_arpabet` scans
    # `arpabet_to_ipa_dict` directly, so edits to that dict are always honoured.

    @staticmethod
    def arpabet_to_ipa(symbol: Text) -> Text:
        """Converts an Arpabet symbol to IPA

        A trailing stress marker (one of the `Stress.PRIMARY`,
        `Stress.SECONDARY` or `Stress.NO_STRESS` values) is dropped before the
        lookup. A symbol without an entry in `arpabet_to_ipa_dict` is returned
        without its stress marker but otherwise unchanged, and an empty symbol
        is returned as-is.

        Example:
        from clu.phontools.pronouncing import ConverterUtils
        bell_arpa = ('B', 'EH1', 'L')
        bell_ipa = tuple(ConverterUtils.arpabet_to_ipa(symb) for symb in bell_arpa)
        # should produce ('b', 'ɛ', 'l')
        """
        # An empty symbol has no final character to inspect (symbol[-1] would raise).
        if not symbol:
            return symbol
        stress: Optional[Text] = (
            None
            if symbol[-1]
            not in {
                Stress.PRIMARY.value,
                Stress.SECONDARY.value,
                Stress.NO_STRESS.value,
            }
            else symbol[-1]
        )
        base_form: Text = symbol if not stress else symbol[:-1]
        return ConverterUtils.arpabet_to_ipa_dict.get(base_form, base_form)

    @staticmethod
    def ipa_to_arpabet(symbol: Text) -> Text:
        """Converts an IPA symbol to Arpabet

        The result carries no stress marker. A symbol that is not a value of
        `arpabet_to_ipa_dict` is returned unchanged.

        Example:
        from clu.phontools.pronouncing import ConverterUtils
        bell_ipa = ('b', 'ɛ', 'l')
        bell_arpa = tuple(ConverterUtils.ipa_to_arpabet(symb) for symb in bell_ipa)
        # should produce ('B', 'EH', 'L')
        """
        for (arpa, ipa) in ConverterUtils.arpabet_to_ipa_dict.items():
            if ipa == symbol:
                return arpa
        # in case of failure, parrot back symbol
        return symbol

Class variables

var arpabet_to_ipa_dict : Dict[str, str]

Dictionary mapping arpabet symbols to IPA

Methods

def arpabet_to_ipa(symbol: str) ‑> str

Converts an Arpabet symbol to IPA

Example: `from clu.phontools.pronouncing import ConverterUtils`; `bell_arpa = ('B', 'EH1', 'L')`; `bell_ipa = tuple(ConverterUtils.arpabet_to_ipa(symb) for symb in bell_arpa)`

should produce ('b', 'ɛ', 'l')

Expand source code
def arpabet_to_ipa(symbol: Text) -> Text:
    """Converts an Arpabet symbol to IPA

    A trailing stress marker (one of the `Stress.PRIMARY`, `Stress.SECONDARY`
    or `Stress.NO_STRESS` values) is dropped before the lookup. A symbol
    without an entry in `arpabet_to_ipa_dict` is returned without its stress
    marker but otherwise unchanged, and an empty symbol is returned as-is.

    Example:
    from clu.phontools.pronouncing import ConverterUtils
    bell_arpa = ('B', 'EH1', 'L')
    bell_ipa = tuple(ConverterUtils.arpabet_to_ipa(symb) for symb in bell_arpa)
    # should produce ('b', 'ɛ', 'l')
    """
    # An empty symbol has no final character to inspect (symbol[-1] would raise).
    if not symbol:
        return symbol
    stress: Optional[Text] = (
        None
        if symbol[-1]
        not in {
            Stress.PRIMARY.value,
            Stress.SECONDARY.value,
            Stress.NO_STRESS.value,
        }
        else symbol[-1]
    )
    base_form: Text = symbol if not stress else symbol[:-1]
    return ConverterUtils.arpabet_to_ipa_dict.get(base_form, base_form)
def ipa_to_arpabet(symbol: str) ‑> str

Converts an IPA symbol to Arpabet

Example: `from clu.phontools.pronouncing import ConverterUtils`; `bell_ipa = ('b', 'ɛ', 'l')`; `bell_arpa = tuple(ConverterUtils.ipa_to_arpabet(symb) for symb in bell_ipa)`

should produce ('B', 'EH', 'L')

Expand source code
def ipa_to_arpabet(symbol: Text) -> Text:
    """Converts an IPA symbol to Arpabet

    The first key of `ConverterUtils.arpabet_to_ipa_dict` whose value equals
    `symbol` is returned; if there is none, `symbol` is parroted back.

    Example:
    from clu.phontools.pronouncing import ConverterUtils
    bell_ipa = ('b', 'ɛ', 'l')
    bell_arpa = tuple(ConverterUtils.ipa_to_arpabet(symb) for symb in bell_ipa)
    # should produce ('B', 'EH', 'L')
    """
    table = ConverterUtils.arpabet_to_ipa_dict
    matches = (arpa_symbol for arpa_symbol, ipa_symbol in table.items() if ipa_symbol == symbol)
    # fall back to the input when no Arpabet symbol maps to it
    return next(matches, symbol)
class PronouncingDict (pairs: List[Tuple[str, Tuple[str, ...]]] = [])

Maps lexical entries (words) -> lists of pronunciations

Expand source code
class PronouncingDict(dict, ABC):
    """
    Maps lexical entries (words) -> lists of pronunciations

    Keys are normalized with `_preprocess_key` (lower-cased) before they are
    stored or looked up. All entries live in the private `_dict` attribute;
    the storage inherited from `dict` is left empty, so only the methods
    overridden here reflect the contents.
    """

    def __init__(
        self, pairs: Optional[List[Tuple[SimpleWord, Pronunciation]]] = None
    ):
        """Builds the dictionary from (word, pronunciation) pairs

        Pairs sharing the same normalized word are grouped, in order, into
        one list of pronunciations. `None` (the default) means no pairs.
        """
        super().__init__()
        self._dict: Dict[SimpleWord, List[Pronunciation]] = self._generate_dict(
            pairs if pairs is not None else []
        )

    @abstractmethod
    def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
        """Returns the stress assignment for each phone in the pronunciation

        Subclasses of `clu.phontools.pronouncing.PronouncingDict` should implement `clu.phontools.pronouncing.PronouncingDict.stress_for`
        """
        pass

    def _preprocess_key(self, key: SimpleWord) -> SimpleWord:
        """Normalizes a word before it is used as a key (lower-cases it)"""
        return key.lower()

    def keys(self):
        return self._dict.keys()

    def values(self):
        return self._dict.values()

    def items(self):
        return self._dict.items()

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def __contains__(self, key: object) -> bool:
        # The inherited dict storage is never populated, so the default
        # membership test would always answer False.
        if not isinstance(key, str):
            return False
        return self._preprocess_key(key) in self._dict

    def get(self, key: Text) -> List[Pronunciation]:
        """Returns the pronunciations for the word, or an empty list if it is unknown"""
        return self._dict.get(self._preprocess_key(key), [])

    def add(self, key: SimpleWord, value: Pronunciation) -> None:
        """Adds a pronunciation for the given word"""
        clean_key: SimpleWord = self._preprocess_key(key)
        old: List[Pronunciation] = self._dict.get(clean_key, [])
        self._dict[clean_key] = old + [value]

    def __getitem__(self, key: SimpleWord) -> List[Pronunciation]:
        # Keys are always stored normalized, so look them up the same way
        # (consistent with `get` and `__setitem__`). Raises KeyError if absent.
        return self._dict[self._preprocess_key(key)]

    def __setitem__(self, key: SimpleWord, value: List[Pronunciation]) -> None:
        self._dict[self._preprocess_key(key)] = value

    def _generate_dict(
        self, pairs: List[Tuple[SimpleWord, Pronunciation]]
    ) -> Dict[SimpleWord, List[Pronunciation]]:
        """Groups (word, pronunciation) pairs by normalized word, keeping input order"""
        pronunciation_dict: Dict[SimpleWord, List[Pronunciation]] = {}
        for (k, v) in pairs:
            pronunciation_dict.setdefault(self._preprocess_key(k), []).append(v)
        return pronunciation_dict

Ancestors

  • builtins.dict
  • abc.ABC

Subclasses

Methods

def add(self, key: str, value: Tuple[str, ...]) ‑> NoneType

Adds a pronunciation for the given word

Expand source code
def add(self, key: SimpleWord, value: Pronunciation) -> None:
    """Adds a pronunciation for the given word

    The word is normalized with `_preprocess_key` first. The entry is
    replaced by a new list (existing pronunciations followed by `value`)
    rather than being extended in place.
    """
    normalized: SimpleWord = self._preprocess_key(key)
    self._dict[normalized] = self._dict.get(normalized, []) + [value]
def get(self, key: str) ‑> str

Returns the list of pronunciations stored for the lower-cased key, or an empty list if the key is not in the dictionary.

Expand source code
def get(self, key: Text) -> List[Pronunciation]:
    """Returns the pronunciations for the word, or an empty list if it is unknown

    The key is normalized with `_preprocess_key` before the lookup.
    """
    return self._dict.get(self._preprocess_key(key), [])
def items(self)

D.items() -> a set-like object providing a view on D's items

Expand source code
def items(self):
    """Returns a view of the (word, pronunciations) pairs held in the backing `_dict`"""
    backing = self._dict
    return backing.items()
def keys(self)

D.keys() -> a set-like object providing a view on D's keys

Expand source code
def keys(self):
    """Returns a view of the words held in the backing `_dict`"""
    backing = self._dict
    return backing.keys()
def stress_for(self, pronunciation: Tuple[str, ...]) ‑> Sequence[Stress]

Returns the stress assignment for each phone in the pronunciation

Subclasses of PronouncingDict should implement PronouncingDict.stress_for()

Expand source code
@abstractmethod
def stress_for(self, pronunciation: Pronunciation) -> Sequence[Stress]:
    """Returns the stress assignment for each phone in the pronunciation

    This base version does nothing and returns None.
    Subclasses of `clu.phontools.pronouncing.PronouncingDict` should implement `clu.phontools.pronouncing.PronouncingDict.stress_for`
    """
    ...
def values(self)

D.values() -> an object providing a view on D's values

Expand source code
def values(self):
    """Returns a view of the pronunciation lists held in the backing `_dict`"""
    backing = self._dict
    return backing.values()