Source code for narmer.phonetic

# -*- coding: utf-8 -*-

# Copyright 2015-2018 by Christopher C. Little.
# This file is part of Narmer.
#
# Narmer is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Narmer is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Narmer. If not, see <http://www.gnu.org/licenses/>.

"""narmer.phonetic.

The phonetic module implements phonetic algorithms including:

    - german_ipa
"""

from __future__ import division, unicode_literals

import unicodedata

from six import text_type
from six.moves import range


[docs]def german_ipa(word, period='nhg'): """Convert German to IPA. Wrapper for other, more specific functions to convert German of various periods to IPA. :param str word: the German word to transcribe to IPA :param str period: a period of German from the set: - nhg (default) -- New High German - enhg -- Early New High German - mhg -- Middle High German - ohg -- Old High German :returns: the German word's approximate IPA equivalent :rtype: str >>> german_ipa('Ehre') 'ere' >>> german_ipa('Kohl') 'kol' >>> german_ipa('Schifffahrt') 'ʃifffart' >>> german_ipa('Schiller') 'ʃiller' >>> german_ipa('Tschechien') 'tʃeçin' """ period_map = {'nhg': nhg_ipa, 'enhg': enhg_ipa, 'mhg': mhg_ipa, 'ohg': ohg_ipa} period = period.lower() if period not in period_map: raise ValueError('Value of period must be one of ' + ', '.join(period_map.keys())) else: return period_map[period](word)
[docs]def nhg_ipa(word): """Convert New High German to IPA. This is based largely on the orthographic mapping described at: https://en.wikipedia.org/wiki/German_orthography No significant attempt is made to accommodate loanwords. :param str word: the NHG word to transcribe to IPA :returns: the NHG word's approximate IPA equivalent :rtype: str >>> nhg_ipa('Ehre') 'ere' >>> nhg_ipa('Kohl') 'kol' >>> nhg_ipa('Schifffahrt') 'ʃifffart' >>> nhg_ipa('Schiller') 'ʃiller' >>> nhg_ipa('Tschechien') 'tʃeçin' """ # pylint: disable=too-many-branches _vowels = frozenset('AEIOUYÄÖÜ') word = unicodedata.normalize('NFKC', text_type(word.upper())) word = word.replace('ß', 'SS') # word = ''.join([c for c in word if c in # frozenset('ABCDEFGIKLMNOPQRSTUVXYZ')]) ipa = '' last = len(word)-1 skip = 0 for i in range(len(word)): if skip: skip -= 1 continue # Consonants if word[i] in frozenset('BFJKLMR'): ipa += word[i].lower() elif word[i] == 'C': if word[i:i+2] == 'CH': if word[i:i+3] == 'CHS': ipa += 'ks' skip = 2 elif word[i:i+4] == 'CHEN': ipa += 'ç' skip = 1 elif i-1 >= 0 and word[i-1] in frozenset('AOU'): ipa += 'x' skip = 1 else: ipa += 'ç' skip = 1 elif word[i:i+2] == 'CK': ipa += 'k' skip = 1 elif i != last and word[i+1] in frozenset('ÄEI'): ipa += 'ts' else: ipa += 'k' elif word[i] == 'D': if word[i:i+4] == 'DSCH': ipa += 'dʒ' skip = 3 elif word[i:i+2] == 'DT': ipa += 't' skip = 1 else: ipa += 'd' elif word[i] == 'G': if i-1 >= 0 and word[i-1] == 'I': ipa += 'ç' else: ipa += 'g' elif word[i] == 'H': # H after vowels should already be covered by the vowel rules ipa += 'h' elif word[i] == 'N': if word[i:i+2] == 'NG': ipa += 'ŋ' skip = 1 elif word[i:i+2] == 'NK': ipa += 'ŋk' skip = 1 else: ipa += 'n' elif word[i] == 'P': if word[i:i+2] == 'PH': ipa += 'f' skip = 1 else: ipa += 'p' elif word[i] == 'Q': if word[i:i+2] == 'QU' and i+1 != last and word[i+2] in _vowels: ipa += 'kv' skip = 1 else: ipa += 'k' elif word[i] == 'S': if word[i:i+2] == 'SS': ipa += 's' skip = 1 elif word[i:i+3] == 'SCH': ipa += 'ʃ' skip = 2 elif i == 0 and i != last and word[i+1] in frozenset('PT'): ipa += 'ʃ' elif i != last and word[i+1] in _vowels: ipa += 'z' else: ipa += 's' elif word[i] == 'T': if word[i:i+4] == 'TSCH': ipa += 'tʃ' skip = 3 elif word[i:i+5] == 'TZSCH': ipa += 'tʃ' skip = 4 elif (word[i:i+4] == 'TION' or word[i:i+4] == 'TIÄR' or word[i:i+4] == 'TIAL' or word[i:i+5] == 'TIELL'): ipa += 'tsi' skip = 1 elif word[i:i+2] == 'TZ': ipa += 'ts' skip = 1 elif word[i:i+2] == 'TH': ipa += 't' skip = 1 else: ipa += 't' elif word[i] == 'V': ipa += 'f' elif word[i] == 'W': ipa += 'v' elif word[i] == 'X': ipa += 'ks' elif word[i] == 'Z': if word[i:i+4] == 'ZSCH': ipa += 'tʃ' skip = 3 else: ipa += 'ts' # Vowels -- little attention is paid to length or tenseness # -Diphthongs first elif word[i:i+2] in frozenset(['EI', 'AI', 'EY', 'AY']): ipa += 'ai' skip = 1 elif word[i:i+2] in frozenset(['EU', 'ÄU']): ipa += 'øy' skip = 1 elif word[i:i+2] == 'AU': ipa += 'au' skip = 1 # -Monophthongs following elif word[i] == 'A': if word[i:i+2] in frozenset(['AA', 'AH']): skip = 1 ipa += 'a' elif word[i] == 'E': if word[i:i+2] in frozenset(['EE', 'EH']): skip = 1 ipa += 'e' elif word[i] == 'I': if word[i:i+2] in frozenset(['IE', 'IH']): skip = 1 if word[i:i+3] == 'IEH': skip = 2 ipa += 'i' elif word[i] == 'O': if word[i:i+2] in frozenset(['OO', 'OH']): skip = 1 ipa += 'o' elif word[i] == 'U': if word[i:i+2] == 'UH': skip = 1 ipa += 'u' elif word[i] == 'Y': ipa += 'y' elif word[i] == 'Ä': if word[i:i+2] == 'ÄH': skip = 1 ipa += 'e' elif word[i] == 'Ö': if word[i:i+2] == 'ÖH': skip = 1 ipa += 'ø' elif word[i] == 'Ü': if word[i:i+2] == 'ÜH': skip = 1 ipa += 'y' return ipa
[docs]def enhg_ipa(word): """Convert Early New High German to IPA. This is based on TODO :param str word: the ENHG word to transcribe to IPA :returns: the ENHG word's approximate IPA equivalent :rtype: str """ # pylint: disable=too-many-branches _vowels = frozenset('AEIOUYÄÖÜ') word = unicodedata.normalize('NFKC', text_type(word.upper())) word = word.replace('ß', 'SS') # word = ''.join([c for c in word if c in # frozenset('ABCDEFGIKLMNOPQRSTUVXYZ')]) ipa = '' last = len(word)-1 skip = 0 for i in range(len(word)): if skip: skip -= 1 continue # Consonants if word[i] in frozenset('BFJKLMR'): ipa += word[i].lower() elif word[i] == 'C': if word[i:i+2] == 'CH': if word[i:i+3] == 'CHS': ipa += 'ks' skip = 2 elif word[i:i+4] == 'CHEN': ipa += 'ç' skip = 1 elif i-1 >= 0 and word[i-1] in frozenset('AOU'): ipa += 'x' skip = 1 else: ipa += 'ç' skip = 1 elif word[i:i+2] == 'CK': ipa += 'k' skip = 1 elif i != last and word[i+1] in frozenset('ÄEI'): ipa += 'ts' else: ipa += 'k' elif word[i] == 'D': if word[i:i+4] == 'DSCH': ipa += 'dʒ' skip = 3 elif word[i:i+2] == 'DT': ipa += 't' skip = 1 else: ipa += 'd' elif word[i] == 'G': if i-1 >= 0 and word[i-1] == 'I': ipa += 'ç' else: ipa += 'g' elif word[i] == 'H': # H after vowels should already be covered by the vowel rules ipa += 'h' elif word[i] == 'N': if word[i:i+2] == 'NG': ipa += 'ŋ' skip = 1 elif word[i:i+2] == 'NK': ipa += 'ŋk' skip = 1 else: ipa += 'n' elif word[i] == 'P': if word[i:i+2] == 'PH': ipa += 'f' skip = 1 else: ipa += 'p' elif word[i] == 'Q': if word[i:i+2] == 'QU' and i+1 != last and word[i+2] in _vowels: ipa += 'kv' skip = 1 else: ipa += 'k' elif word[i] == 'S': if word[i:i+2] == 'SS': ipa += 's' skip = 1 elif word[i:i+3] == 'SCH': ipa += 'ʃ' skip = 2 elif i == 0 and i != last and word[i+1] in frozenset('PT'): ipa += 'ʃ' elif i != last and word[i+1] in _vowels: ipa += 'z' else: ipa += 's' elif word[i] == 'T': if word[i:i+4] == 'TSCH': ipa += 'tʃ' skip = 3 elif word[i:i+5] == 'TZSCH': ipa += 'tʃ' skip = 4 elif (word[i:i+4] == 'TION' or word[i:i+4] == 'TIÄR' or word[i:i+4] == 'TIAL' or word[i:i+5] == 'TIELL'): ipa += 'tsi' skip = 1 elif word[i:i+2] == 'TZ': ipa += 'ts' skip = 1 elif word[i:i+2] == 'TH': ipa += 't' skip = 1 else: ipa += 't' elif word[i] == 'V': ipa += 'f' elif word[i] == 'W': ipa += 'v' elif word[i] == 'X': ipa += 'ks' elif word[i] == 'Z': if word[i:i+4] == 'ZSCH': ipa += 'tʃ' skip = 3 else: ipa += 'ts' # Vowels -- little attention is paid to length or tenseness # -Diphthongs first elif word[i:i+2] in frozenset(['EI', 'AI', 'EY', 'AY']): ipa += 'ai' skip = 1 elif word[i:i+2] in frozenset(['EU', 'ÄU']): ipa += 'øy' skip = 1 elif word[i:i+2] == 'AU': ipa += 'au' skip = 1 # -Monophthongs following elif word[i] == 'A': if word[i:i+2] in frozenset(['AA', 'AH']): skip = 1 ipa += 'a' elif word[i] == 'E': if word[i:i+2] in frozenset(['EE', 'EH']): skip = 1 ipa += 'e' elif word[i] == 'I': if word[i:i+2] in frozenset(['IE', 'IH']): skip = 1 if word[i:i+3] == 'IEH': skip = 2 ipa += 'i' elif word[i] == 'O': if word[i:i+2] in frozenset(['OO', 'OH']): skip = 1 ipa += 'o' elif word[i] == 'U': if word[i:i+2] == 'UH': skip = 1 ipa += 'u' elif word[i] == 'Y': ipa += 'y' elif word[i] == 'Ä': if word[i:i+2] == 'ÄH': skip = 1 ipa += 'e' elif word[i] == 'Ö': if word[i:i+2] == 'ÖH': skip = 1 ipa += 'ø' elif word[i] == 'Ü': if word[i:i+2] == 'ÜH': skip = 1 ipa += 'y' return ipa
[docs]def mhg_ipa(word): """Convert Middle High German to IPA. This is based on http://users.clas.ufl.edu/hasty/resources/CHAPTER1.HTM :param str word: the ENHG word to transcribe to IPA :returns: the ENHG word's approximate IPA equivalent :rtype: str """ # pylint: disable=too-many-branches _vowels = frozenset('AEIOUYÄÖÜÆŒĀĒĪŌŪË') word = unicodedata.normalize('NFKC', text_type(word.upper())) word = word.replace('ß', 'SS') for ch_from, ch_to in zip('ÂÊÎÔÛ', 'ĀĒĪŌŪ'): word = word.replace(ch_from, ch_to) # word = ''.join([c for c in word if c in # frozenset('ABCDEFGIKLMNOPQRSTUVXYZ')]) ipa = '' last = len(word)-1 skip = 0 for i in range(len(word)): if skip: skip -= 1 continue # Consonants if word[i] in frozenset('BFJKLMRW'): ipa += word[i].lower() elif word[i] == 'C': if word[i:i+2] == 'CH': ipa += 'x' skip = 1 elif word[i:i+2] == 'CK': ipa += 'k' skip = 1 else: ipa += 'k' elif word[i] == 'D': if word[i:i+4] == 'DSCH': ipa += 'dʒ' skip = 3 elif word[i:i+2] == 'DT': ipa += 't' skip = 1 else: ipa += 'd' elif word[i] == 'G': if i-1 >= 0 and word[i-1] == 'I': ipa += 'g' else: ipa += 'g' elif word[i] == 'H': if word[i-1:i] in _vowels and word[i+1:i+2] in _vowels: ipa += 'h' else: ipa += 'x' elif word[i] == 'N': if word[i:i+2] == 'NG': ipa += 'ŋg' skip = 1 elif word[i:i+2] == 'NK': ipa += 'ŋk' skip = 1 else: ipa += 'n' elif word[i] == 'P': if word[i:i+2] == 'PH': ipa += 'pf' skip = 1 else: ipa += 'p' elif word[i] == 'Q': if word[i:i+2] == 'QU' and i+1 != last and word[i+2] in _vowels: ipa += 'kv' skip = 1 else: ipa += 'k' elif word[i] == 'S': if word[i:i+2] == 'SS': ipa += 's' skip = 1 elif word[i:i+3] == 'SCH': ipa += 'ʃ' skip = 2 elif word[i:i+2] in frozenset(['SC', 'SK']): ipa += 'ʃ' skip = 1 elif i == 0 and i != last and word[i+1] in frozenset('PT'): ipa += 'ʃ' elif i != last and word[i+1] in _vowels: ipa += 'z' else: ipa += 's' elif word[i] == 'T': if word[i:i+4] == 'TSCH': ipa += 'tʃ' skip = 3 elif word[i:i+5] == 'TZSCH': ipa += 'tʃ' skip = 4 elif (word[i:i+4] == 'TION' or word[i:i+4] == 'TIÄR' or word[i:i+4] == 'TIAL' or word[i:i+5] == 'TIELL'): ipa += 'tsi' skip = 1 elif word[i:i+2] == 'TZ': ipa += 'ts' skip = 1 elif word[i:i+2] == 'TH': ipa += 't' skip = 1 else: ipa += 't' elif word[i] == 'V': ipa += 'f' elif word[i] == 'X': ipa += 'ks' elif word[i] == 'Z': if word[i:i+4] == 'ZSCH': ipa += 'tʃ' skip = 3 else: ipa += 'ts' # Vowels -- little attention is paid to length or tenseness # -Diphthongs first elif word[i:i+2] in frozenset(['EI', 'EY']): ipa += 'ei' skip = 1 elif word[i:i+2] in frozenset(['AI', 'AY']): ipa += 'ai' skip = 1 elif word[i:i+2] == 'IE': ipa += 'ie' skip = 1 elif word[i:i+2] == 'AU': ipa += 'au' skip = 1 elif word[i:i+2] == 'ÜE': ipa += 'yu' skip = 1 elif word[i:i+2] in frozenset(['ÖU', 'EU', 'OI']): ipa += 'øy' skip = 1 # -Monophthongs following elif word[i:i+2] == 'IU': skip = 1 ipa += 'yː' elif word[i:i+2] == 'AE': skip = 1 ipa += 'ɛː' elif word[i:i+2] == 'OE': skip = 1 ipa += 'øː' elif word[i] == 'Ā': ipa += 'aː' elif word[i:i+2] == 'AA': skip = 1 ipa += 'aː' elif word[i] == 'A': ipa += 'a' elif word[i] == 'Ē': ipa += 'eː' elif word[i:i+2] == 'EE': skip = 1 ipa += 'eː' elif word[i] == 'E': ipa += 'e' elif word[i] == 'Ī': ipa += 'iː' elif word[i:i+2] == 'II': skip = 1 ipa += 'iː' elif word[i] == 'I': ipa += 'i' elif word[i] == 'Ō': ipa += 'oː' elif word[i:i+2] == 'OO': skip = 1 ipa += 'oː' elif word[i] == 'O': ipa += 'o' elif word[i] == 'Ū': ipa += 'uː' elif word[i:i+2] == 'UU': skip = 1 ipa += 'uː' elif word[i] == 'U': ipa += 'u' elif word[i] == 'Y': ipa += 'y' elif word[i] == 'Æ': ipa += 'ɛː' elif word[i] == 'Ä': ipa += 'ɛ' elif word[i] == 'Œ': ipa += 'øː' elif word[i] == 'Ö': ipa += 'ø' elif word[i] == 'Ü': ipa += 'y' elif word[i] == 'Ë': ipa += 'ɛ' return ipa
[docs]def ohg_ipa(word): """Convert Old High German to IPA. This is based on TODO :param str word: the ENHG word to transcribe to IPA :returns: the ENHG word's approximate IPA equivalent :rtype: str """ # pylint: disable=too-many-branches _vowels = frozenset('AEIOUĀĒĪŌŪË') word = unicodedata.normalize('NFKC', text_type(word.upper())) word = word.replace('ß', 'SS') for ch_from, ch_to in zip('ÂÊÎÔÛ', 'ĀĒĪŌŪ'): word = word.replace(ch_from, ch_to) # word = ''.join([c for c in word if c in # frozenset('ABCDEFGIKLMNOPQRSTUVXYZ')]) ipa = '' last = len(word)-1 skip = 0 for i in range(len(word)): if skip: skip -= 1 continue # Consonants if word[i] in frozenset('BFJKLMR'): ipa += word[i].lower() elif word[i] == 'C': if word[i:i+2] == 'CH': if word[i:i+3] == 'CHS': ipa += 'ks' skip = 2 elif word[i:i+4] == 'CHEN': ipa += 'ç' skip = 1 elif i-1 >= 0 and word[i-1] in frozenset('AOU'): ipa += 'x' skip = 1 else: ipa += 'ç' skip = 1 elif word[i:i+2] == 'CK': ipa += 'k' skip = 1 elif i != last and word[i+1] in frozenset('ÄEI'): ipa += 'ts' else: ipa += 'k' elif word[i] == 'D': if word[i:i+4] == 'DSCH': ipa += 'dʒ' skip = 3 elif word[i:i+2] == 'DT': ipa += 't' skip = 1 else: ipa += 'd' elif word[i] == 'G': if i-1 >= 0 and word[i-1] == 'I': ipa += 'ç' else: ipa += 'g' elif word[i] == 'H': # H after vowels should already be covered by the vowel rules ipa += 'h' elif word[i] == 'N': if word[i:i+2] == 'NG': ipa += 'ŋ' skip = 1 elif word[i:i+2] == 'NK': ipa += 'ŋk' skip = 1 else: ipa += 'n' elif word[i] == 'P': if word[i:i+2] == 'PH': ipa += 'f' skip = 1 else: ipa += 'p' elif word[i] == 'Q': if word[i:i+2] == 'QU' and i+1 != last and word[i+2] in _vowels: ipa += 'kv' skip = 1 else: ipa += 'k' elif word[i] == 'S': if word[i:i+2] == 'SS': ipa += 's' skip = 1 elif word[i:i+3] == 'SCH': ipa += 'ʃ' skip = 2 elif i == 0 and i != last and word[i+1] in frozenset('PT'): ipa += 'ʃ' elif i != last and word[i+1] in _vowels: ipa += 'z' else: ipa += 's' elif word[i] == 'T': if word[i:i+4] == 'TSCH': ipa += 'tʃ' skip = 3 elif word[i:i+5] == 'TZSCH': ipa += 'tʃ' skip = 4 elif (word[i:i+4] == 'TION' or word[i:i+4] == 'TIÄR' or word[i:i+4] == 'TIAL' or word[i:i+5] == 'TIELL'): ipa += 'tsi' skip = 1 elif word[i:i+2] == 'TZ': ipa += 'ts' skip = 1 elif word[i:i+2] == 'TH': ipa += 't' skip = 1 else: ipa += 't' elif word[i] == 'V': ipa += 'f' elif word[i] == 'W': ipa += 'v' elif word[i] == 'X': ipa += 'ks' elif word[i] == 'Z': if word[i:i+4] == 'ZSCH': ipa += 'tʃ' skip = 3 else: ipa += 'ts' # Vowels -- little attention is paid to length or tenseness # -Diphthongs first elif word[i:i+2] in frozenset(['EI', 'AI', 'EY', 'AY']): ipa += 'ai' skip = 1 elif word[i:i+2] in frozenset(['EU', 'ÄU']): ipa += 'øy' skip = 1 elif word[i:i+2] == 'AU': ipa += 'au' skip = 1 # -Monophthongs following elif word[i] == 'A': if word[i:i+2] in frozenset(['AA', 'AH']): skip = 1 ipa += 'a' elif word[i] == 'E': if word[i:i+2] in frozenset(['EE', 'EH']): skip = 1 ipa += 'e' elif word[i] == 'I': if word[i:i+2] in frozenset(['IE', 'IH']): skip = 1 if word[i:i+3] == 'IEH': skip = 2 ipa += 'i' elif word[i] == 'O': if word[i:i+2] in frozenset(['OO', 'OH']): skip = 1 ipa += 'o' elif word[i] == 'U': if word[i:i+2] == 'UH': skip = 1 ipa += 'u' elif word[i] == 'Y': ipa += 'y' elif word[i] == 'Ä': if word[i:i+2] == 'ÄH': skip = 1 ipa += 'e' elif word[i] == 'Ö': if word[i:i+2] == 'ÖH': skip = 1 ipa += 'ø' elif word[i] == 'Ü': if word[i:i+2] == 'ÜH': skip = 1 ipa += 'y' return ipa