# Source code for indic_transliteration.sanscript

# -*- coding: utf-8 -*-
"""
Transliteration functions for Sanskrit. The most important function is
:func:`transliterate`, which is very easy to use::

    output = transliterate(data, IAST, DEVANAGARI)

By default, the module supports the following scripts:

- Bengali
- Devanagari
- Gujarati
- Kannada
- Malayalam
- Telugu
- Tamil
- Oriya
- Gurmukhi/ Punjabi/ Panjabi

and the following romanizations:

- HK = 'hk'
- IAST = 'iast'
- ITRANS = 'itrans'
- OPTITRANS = 'optitrans'
- KOLKATA = 'kolkata'
- SLP1 = 'slp1'
- VELTHUIS = 'velthuis'
- WX = 'wx'

Each of these **schemes** is defined in a global dictionary `SCHEMES`, whose
keys are strings::

    devanagari_scheme = SCHEMES['devanagari']

For convenience, we also define a variable for each scheme::

    devanagari_scheme = SCHEMES[DEVANAGARI]

These variables are documented below.

:license: MIT and BSD

.. _Bengali: http://en.wikipedia.org/wiki/Bengali_alphabet
.. _Devanagari: http://en.wikipedia.org/wiki/Devanagari
.. _Gujarati: http://en.wikipedia.org/wiki/Gujarati_alphabet
.. _Kannada: http://en.wikipedia.org/wiki/Kannada_alphabet
.. _Malayalam: http://en.wikipedia.org/wiki/Malayalam_alphabet
.. _Telugu: http://en.wikipedia.org/wiki/Telugu_alphabet

.. _Harvard-Kyoto: http://en.wikipedia.org/wiki/Harvard-Kyoto
.. _IAST: http://en.wikipedia.org/wiki/IAST
"""

from __future__ import unicode_literals

# Brahmic schemes
# ---------------
# Note on the Bengali scheme (named by ``BENGALI`` below): Bengali ``ba`` and
# ``va`` are both rendered as `ব`.
import sys
from indic_transliteration.sanscript import schemes
from indic_transliteration.sanscript.schemes import Scheme
from indic_transliteration.sanscript.schemes import roman
from indic_transliteration.sanscript.schemes import brahmi

try:
    from functools import lru_cache
except ImportError:
    from backports.functools_lru_cache import lru_cache

# These variables are replicated here for backward compatibility.
# -------------
# Names of the Brahmic schemes, aliased from the ``brahmi`` module. They are
# used below as keys of `SCHEMES` and as the `name` of each Brahmic scheme.
BENGALI = brahmi.BENGALI
DEVANAGARI = brahmi.DEVANAGARI
GUJARATI = brahmi.GUJARATI
GURMUKHI = brahmi.GURMUKHI
KANNADA = brahmi.KANNADA
MALAYALAM = brahmi.MALAYALAM
ORIYA = brahmi.ORIYA
TAMIL = brahmi.TAMIL
TELUGU = brahmi.TELUGU
# Names of the romanizations, aliased from the ``roman`` module. They are
# used below as keys of `SCHEMES`.
HK = roman.HK
IAST = roman.IAST
ITRANS = roman.ITRANS
OPTITRANS = roman.OPTITRANS
KOLKATA = roman.KOLKATA
SLP1 = roman.SLP1
VELTHUIS = roman.VELTHUIS
WX = roman.WX

# Registry mapping a scheme name to its scheme object. It starts out empty
# and is filled in by `_setup()`, which is called at the end of this module.
SCHEMES = {}


class SchemeMap(object):
    """Maps one :class:`Scheme` to another. This class grabs the metadata and
    character data required for :func:`transliterate`.

    After construction the following attributes are available:

    - ``marks``: source vowel mark -> destination vowel mark
    - ``virama``: source virama -> destination virama
    - ``vowels``: source vowel -> destination vowel
    - ``consonants``: source consonant -> destination consonant
    - ``non_marks_viraama``: every source token that is neither a mark nor
      a virama (this includes the vowels and consonants above)
    - ``from_scheme`` / ``to_scheme``: the two schemes, as passed in
    - ``max_key_length_from_scheme``: length of the longest token listed in
      any group of the source scheme

    :param from_scheme: the source scheme
    :param to_scheme: the destination scheme
    """

    # Conjuncts for which OPTITRANS has a concise spelling. Each row is
    # (nasal consonant key, following consonant key, concise spelling); the
    # first two are OPTITRANS consonant keys.
    _OPTITRANS_NASAL_CONJUNCTS = (
        ("~N", "k", "nk"),
        ("~N", "kh", "nkh"),
        ("~N", "g", "ng"),
        ("~N", "gh", "ngh"),
        ("~n", "ch", "nch"),
        ("~n", "Ch", "nCh"),
        ("~n", "j", "nj"),
        ("~n", "jh", "njh"),
    )

    def __init__(self, from_scheme, to_scheme):
        """Create a mapping from `from_scheme` to `to_scheme`."""
        self.marks = {}
        self.virama = {}
        self.vowels = {}
        self.consonants = {}
        self.non_marks_viraama = {}
        self.from_scheme = from_scheme
        self.to_scheme = to_scheme
        self.max_key_length_from_scheme = max(
            len(x) for g in from_scheme for x in from_scheme[g])

        # Pair up the tokens of every group that both schemes define. The
        # pairing is positional: the n-th source token maps to the n-th
        # destination token of the same group.
        for group in from_scheme.keys():
            if group not in to_scheme.keys():
                continue
            group_map = {}
            for (k, v) in zip(from_scheme[group], to_scheme[group]):
                group_map[k] = v
                # Alternative spellings of `k` map to the same output.
                if k in from_scheme.synonym_map:
                    for k_syn in from_scheme.synonym_map[k]:
                        group_map[k_syn] = v
            if group.endswith('marks'):
                self.marks.update(group_map)
            elif group == 'virama':
                self.virama = group_map
            else:
                self.non_marks_viraama.update(group_map)
                if group.endswith('consonants'):
                    self.consonants.update(group_map)
                elif group.endswith('vowels'):
                    self.vowels.update(group_map)

        if from_scheme.name == OPTITRANS:
            self._add_conjuncts_from_optitrans()
        if to_scheme.name == OPTITRANS:
            self._add_conjuncts_to_optitrans()

    @staticmethod
    def _first_virama(scheme):
        """Return the first virama of `scheme`, or '' if it lists none."""
        viramas = scheme['virama']
        if len(viramas) == 0:
            return ''
        return viramas[0]

    def _add_conjuncts_from_optitrans(self):
        """Accept the concise OPTITRANS conjunct spellings as input tokens."""
        to_scheme_virama = self._first_virama(self.to_scheme)
        consonants = self.consonants
        conjunct_map = {}
        for nasal, consonant, concise in self._OPTITRANS_NASAL_CONJUNCTS:
            # A conjunct can only be built if both halves were mapped.
            if nasal in consonants and consonant in consonants:
                conjunct_map[concise] = (
                    consonants[nasal] + to_scheme_virama + consonants[consonant])
        self.consonants.update(conjunct_map)
        self.non_marks_viraama.update(conjunct_map)

        # Also accept "n" followed by any synonym of the second consonant.
        synonym_map = self.from_scheme.synonym_map
        synonym_conjunct_map = {}
        for key, value in conjunct_map.items():
            latter_consonant = key[1:]
            if latter_consonant in synonym_map:
                for k_syn in synonym_map[latter_consonant]:
                    synonym_conjunct_map["n" + k_syn] = value
        self.consonants.update(synonym_conjunct_map)
        self.non_marks_viraama.update(synonym_conjunct_map)

    def _add_conjuncts_to_optitrans(self):
        """Emit the concise OPTITRANS spelling for the matching conjuncts."""
        # OPTITRANS consonant -> source consonant. When several source
        # consonants map to the same OPTITRANS consonant, or one source
        # glyph is listed for several OPTITRANS consonants (as in the Tamil
        # table in this module), some OPTITRANS keys are absent here.
        inv_map = {v: k for k, v in self.consonants.items()}
        from_scheme_virama = self._first_virama(self.from_scheme)
        conjunct_map = {}
        for nasal, consonant, concise in self._OPTITRANS_NASAL_CONJUNCTS:
            # Skip conjuncts the source scheme cannot spell instead of
            # failing with a KeyError while building the map.
            if nasal in inv_map and consonant in inv_map:
                source_conjunct = (
                    inv_map[nasal] + from_scheme_virama + inv_map[consonant])
                conjunct_map[source_conjunct] = concise
        self.consonants.update(conjunct_map)
        self.non_marks_viraama.update(conjunct_map)

    def __str__(self):
        import pprint
        return pprint.pformat({"vowels": self.vowels, "marks": self.marks, "virama": self.virama, "consonants": self.consonants})
def _roman(data, scheme_map, **kw): """Transliterate `data` with the given `scheme_map`. This function is used when the source scheme is a Roman scheme. :param data: the data to transliterate :param scheme_map: a dict that maps between characters in the old scheme and characters in the new scheme """ vowels = scheme_map.vowels marks = scheme_map.marks virama = scheme_map.virama consonants = scheme_map.consonants non_marks_viraama = scheme_map.non_marks_viraama max_key_length_from_scheme = scheme_map.max_key_length_from_scheme to_roman = scheme_map.to_scheme.is_roman togglers = kw.pop('togglers', set()) suspend_on = kw.pop('suspend_on', set()) suspend_off = kw.pop('suspend_off', set()) if kw: raise TypeError('Unexpected keyword argument %s' % list(kw.keys())[0]) buf = [] i = 0 had_consonant = found = False len_data = len(data) append = buf.append # If true, don't transliterate. The toggle token is discarded. toggled = False # If true, don't transliterate. The suspend token is retained. # `suspended` overrides `toggled`. suspended = False while i <= len_data: # The longest token in the source scheme has length `max_key_length_from_scheme`. Iterate # over `data` while taking `max_key_length_from_scheme` characters at a time. If we don`t # find the character group in our scheme map, lop off a character and # try again. # # If we've finished reading through `data`, then `token` will be empty # and the loop below will be skipped. token = data[i:i + max_key_length_from_scheme] while token: if token in togglers: toggled = not toggled i += 2 # skip over the token found = True # force the token to fill up again break if token in suspend_on: suspended = True elif token in suspend_off: suspended = False if toggled or suspended: token = token[:-1] continue # Catch the pattern CV, where C is a consonant and V is a vowel. # V should be rendered as a vowel mark, a.k.a. a "dependent" # vowel. But due to the nature of Brahmic scripts, 'a' is implicit # and has no vowel mark. 
If we see 'a', add nothing. if had_consonant and token in vowels: mark = marks.get(token, '') if mark: append(mark) elif to_roman: append(vowels[token]) found = True # Catch any non_marks_viraama character, including consonants, punctuation, # and regular vowels. Due to the implicit 'a', we must explicitly # end any lingering consonants before we can handle the current # token. elif token in non_marks_viraama: if had_consonant: append(virama['']) append(non_marks_viraama[token]) found = True if found: had_consonant = token in consonants i += len(token) break else: token = token[:-1] # We've exhausted the token; this must be some other character. Due to # the implicit 'a', we must explicitly end any lingering consonants # before we can handle the current token. if not found: if had_consonant: append(virama['']) if i < len_data: append(data[i]) had_consonant = False i += 1 found = False return ''.join(buf) def _brahmic(data, scheme_map, **kw): """Transliterate `data` with the given `scheme_map`. This function is used when the source scheme is a Brahmic scheme. :param data: the data to transliterate :param scheme_map: a dict that maps between characters in the old scheme and characters in the new scheme """ marks = scheme_map.marks virama = scheme_map.virama consonants = scheme_map.consonants non_marks_viraama = scheme_map.non_marks_viraama to_roman = scheme_map.to_scheme.is_roman max_key_length_from_scheme = scheme_map.max_key_length_from_scheme buf = [] i = 0 to_roman_had_consonant = found = False append = buf.append # logging.debug(pprint.pformat(scheme_map.consonants)) # We dont just translate each brAhmic character one after another in order to prefer concise transliterations when possible - for example ज्ञ -> jn in optitrans rather than j~n. while i <= len(data): # The longest token in the source scheme has length `max_key_length_from_scheme`. Iterate # over `data` while taking `max_key_length_from_scheme` characters at a time. 
If we don`t # find the character group in our scheme map, lop off a character and # try again. # # If we've finished reading through `data`, then `token` will be empty # and the loop below will be skipped. token = data[i:i + max_key_length_from_scheme] while token: if len(token) == 1: if token in marks: append(marks[token]) found = True elif token in virama: append(virama[token]) found = True else: if to_roman_had_consonant: append('a') append(non_marks_viraama.get(token, token)) found = True else: if token in non_marks_viraama: if to_roman_had_consonant: append('a') append(non_marks_viraama.get(token)) found = True if found: to_roman_had_consonant = to_roman and token in consonants i += len(token) break else: token = token[:-1] # Continuing the outer while loop. # We've exhausted the token; this must be some other character. Due to # the implicit 'a', we must explicitly end any lingering consonants # before we can handle the current token. if not found: if to_roman_had_consonant: append(next(iter(virama.values()))) if i < len(data): append(data[i]) to_roman_had_consonant = False i += 1 found = False if to_roman_had_consonant: append('a') return ''.join(buf) @lru_cache(maxsize=8) def _get_scheme_map(input_encoding, output_encoding): """Provides a caching layer on top of `SchemeMap` objects to allow faster access to scheme maps we've instantiated once. :param input_encoding: Input encoding. Must be defined in `SCHEMES`. :param output_encoding: Input encoding. Must be defined in `SCHEMES`. """ return SchemeMap(SCHEMES[input_encoding], SCHEMES[output_encoding])
def transliterate(data, _from=None, _to=None, scheme_map=None, **kw):
    """Transliterate `data` with the given parameters::

        output = transliterate('idam adbhutam', HK, DEVANAGARI)

    When no `scheme_map` is given, the :class:`SchemeMap` for the pair
    (`_from`, `_to`) is looked up through :func:`_get_scheme_map`. You can
    also pass a pre-computed :class:`SchemeMap` yourself::

        scheme_map = SchemeMap(SCHEMES[HK], SCHEMES[DEVANAGARI])
        output = transliterate('idam adbhutam', scheme_map=scheme_map)

    :param data: the data to transliterate
    :param _from: name of the source scheme, a key of `SCHEMES`
    :param _to: name of the destination scheme, a key of `SCHEMES`
    :param scheme_map: the :class:`SchemeMap` to use. If specified, ignore
                       `_from` and `_to`. If unspecified, obtain a
                       :class:`SchemeMap` from `_from` to `_to`.
    :param kw: overrides for the default options `togglers` (``'##'``),
               `suspend_on` (``'<'``) and `suspend_off` (``'>'``); they are
               forwarded to the transliteration routine
    :return: the transliterated string
    """
    if scheme_map is None:
        scheme_map = _get_scheme_map(_from, _to)

    # Defaults first, then whatever the caller supplied on top.
    options = dict(togglers={'##'}, suspend_on={'<'}, suspend_off={'>'})
    options.update(kw)

    # The routine is chosen by the kind of the *source* scheme.
    if scheme_map.from_scheme.is_roman:
        return _roman(data, scheme_map, **options)
    return _brahmic(data, scheme_map, **options)
def get_standard_form(data, scheme_name):
    """Round-trip `data` through Devanagari and back into `scheme_name`.

    :param data: text written in the scheme `scheme_name`
    :param scheme_name: name of the scheme of `data`, a key of `SCHEMES`
    :return: the result of transliterating `data` to Devanagari and then
             back to `scheme_name`
    """
    devanagari_text = transliterate(data=data, _from=scheme_name, _to=DEVANAGARI)
    return transliterate(data=devanagari_text, _from=DEVANAGARI, _to=scheme_name)
def _setup():
    """Add a variety of default schemes.

    Registers the romanizations provided by the ``roman`` module and the
    Brahmic scripts spelled out below in the module-level `SCHEMES` dict.
    """
    # With `unicode_literals` in effect the tables below are unicode strings
    # on Python 2, so they need the unicode flavour of `split` there.
    # noinspection PyUnresolvedReferences
    words = unicode.split if sys.version_info < (3, 0) else str.split

    def brahmic(name, groups):
        # One (key, scheme) pair for a non-roman scheme called `name`.
        return name, Scheme(groups, is_roman=False, name=name)

    ## NOTE: See the Scheme constructor documentation for a few general notes while defining schemes.
    entries = [
        (HK, roman.HkScheme()),
        (VELTHUIS, roman.VelthiusScheme()),
        (OPTITRANS, roman.OptitransScheme()),
        (ITRANS, roman.ItransScheme()),
        (IAST, roman.IastScheme()),
        (KOLKATA, roman.IastScheme(kolkata_variant=True)),
        (SLP1, roman.Slp1Scheme()),
        (WX, roman.WxScheme()),
        brahmic(BENGALI, {
            'vowels': words("""অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"""),
            'marks': words("""া ি ী ু ূ ৃ ৄ ৢ ৣ ে ৈ ো ৌ"""),
            'virama': words('্'),
            'yogavaahas': words('ং ঃ ঁ'),
            'consonants': words(""" ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ """),
            'symbols': words(""" ॐ ঽ । ॥ ০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ """),
        }),
        brahmic(DEVANAGARI, {
            'vowels': words("""अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ए ऐ ओ औ"""),
            'marks': words("""ा ि ी ु ू ृ ॄ ॢ ॣ े ै ो ौ"""),
            'virama': words('्'),
            'yogavaahas': words('ं ः ँ'),
            'consonants': words(""" क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ """),
            'symbols': words(""" ॐ ऽ । ॥ ० १ २ ३ ४ ५ ६ ७ ८ ९ """),
        }),
        brahmic(GUJARATI, {
            'vowels': words("""અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ"""),
            'marks': words("""ા િ ી ુ ૂ ૃ ૄ ૢ ૣ ે ૈ ો ૌ"""),
            'virama': words('્'),
            'yogavaahas': words('ં ઃ ઁ'),
            'consonants': words(""" ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ """),
            'symbols': words(""" ૐ ઽ ૤ ૥ ૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ """),
        }),
        brahmic(GURMUKHI, {
            'vowels': words("""ਅ ਆ ਇ ਈ ਉ ਊ ऋ ॠ ऌ ॡ ਏ ਐ ਓ ਔ"""),
            # Written as a list because some entries are empty strings,
            # which splitting on whitespace cannot produce.
            'marks': ['ਾ', 'ਿ', 'ੀ', 'ੁ', 'ੂ', '', '', '', '', 'ੇ', 'ੈ', 'ੋ', 'ੌ'],
            'virama': words('੍'),
            'yogavaahas': words('ਂ ਃ ਁ'),
            'consonants': words(""" ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ """),
            'symbols': words(""" ੴ ఽ । ॥ ੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ """),
        }),
        brahmic(KANNADA, {
            'vowels': words("""ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಏ ಐ ಓ ಔ"""),
            'marks': words("""ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೇ ೈ ೋ ೌ"""),
            'virama': words('್'),
            'yogavaahas': words('ಂ ಃ ँ'),
            'consonants': words(""" ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ """),
            'symbols': words(""" ಓಂ ऽ । ॥ ೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ """),
        }),
        brahmic(MALAYALAM, {
            'vowels': words("""അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ ഏ ഐ ഓ ഔ"""),
            'marks': words("""ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ േ ൈ ോ ൌ"""),
            'virama': words('്'),
            'yogavaahas': words('ം ഃ ँ'),
            'consonants': words(""" ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ """),
            'symbols': words(""" ഓം ഽ । ॥ ൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ """),
        }),
        brahmic(ORIYA, {
            'vowels': words("""ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ"""),
            # Written as a list because some entries are empty strings.
            'marks': ['ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'ୄ', '', '', 'େ', 'ୈ', 'ୋ', 'ୌ'],
            'virama': words('୍'),
            'yogavaahas': words('ଂ ଃ ଁ'),
            'consonants': words(""" କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ """),
            'symbols': words(""" ଓଂ ଽ । ॥ ୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ """),
        }),
        brahmic(TAMIL, {
            'vowels': words("""அ ஆ இ ஈ உ ஊ ऋ ॠ ऌ ॡ ஏ ஐ ஓ ஔ"""),
            # Written as a list because some entries are empty strings.
            'marks': ['ா', 'ி', 'ீ', 'ு', 'ூ', '', '', '', '', 'ே', 'ை', 'ோ', 'ௌ'],
            'virama': words('்'),
            'yogavaahas': words('ஂ ஃ ँ'),
            'consonants': words(""" க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ """),
            'symbols': words(""" ௐ ऽ । ॥ ௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ """),
        }),
        brahmic(TELUGU, {
            'vowels': words("""అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఏ ఐ ఓ ఔ"""),
            'marks': words("""ా ి ీ ు ూ ృ ౄ ౢ ౣ ే ై ో ౌ"""),
            'virama': words('్'),
            'yogavaahas': words('ం ః ఁ'),
            'consonants': words(""" క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ """),
            'symbols': words(""" ఓం ఽ । ॥ ౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ """),
        }),
    ]
    # A single update, so the registry only changes once every scheme has
    # been built.
    SCHEMES.update(entries)


_setup()