# -*- coding: utf-8 -*-
"""
Transliteration functions for Sanskrit. The most important function is
:func:`transliterate`, which is very easy to use::

    output = transliterate(data, IAST, DEVANAGARI)

By default, the module supports the following scripts:
- Bengali
- Devanagari
- Gujarati
- Kannada
- Malayalam
- Telugu
- Tamil
- Oriya
- Gurmukhi/ Punjabi/ Panjabi
and the following romanizations:
- HK = 'hk'
- IAST = 'iast'
- ITRANS = 'itrans'
- OPTITRANS = 'optitrans'
- KOLKATA = 'kolkata'
- SLP1 = 'slp1'
- VELTHUIS = 'velthuis'
- WX = 'wx'
Each of these **schemes** is defined in a global dictionary `SCHEMES`, whose
keys are strings::
devanagari_scheme = SCHEMES['devanagari']
For convenience, we also define a variable for each scheme::
devanagari_scheme = SCHEMES[DEVANAGARI]
These variables are documented below.
:license: MIT and BSD
.. _Bengali: http://en.wikipedia.org/wiki/Bengali_alphabet
.. _Devanagari: http://en.wikipedia.org/wiki/Devanagari
.. _Gujarati: http://en.wikipedia.org/wiki/Gujarati_alphabet
.. _Kannada: http://en.wikipedia.org/wiki/Kannada_alphabet
.. _Malayalam: http://en.wikipedia.org/wiki/Malayalam_alphabet
.. _Telugu: http://en.wikipedia.org/wiki/Telugu_alphabet
.. _Harvard-Kyoto: http://en.wikipedia.org/wiki/Harvard-Kyoto
.. _IAST: http://en.wikipedia.org/wiki/IAST
"""
from __future__ import unicode_literals
# Brahmic schemes
# ---------------
#: Internal name of Bengali. Bengali ``ba`` and ``va`` are both rendered
#: as `ব`.
import sys
from indic_transliteration.sanscript import schemes
from indic_transliteration.sanscript.schemes import Scheme
from indic_transliteration.sanscript.schemes import roman
from indic_transliteration.sanscript.schemes import brahmi
try:
from functools import lru_cache
except ImportError:
from backports.functools_lru_cache import lru_cache
# These variables are replicated here for backward compatibility.
# -------------
# Identifiers of the Brahmic schemes, re-exported from `schemes.brahmi`.
# They are used as keys into `SCHEMES`.
BENGALI = brahmi.BENGALI
DEVANAGARI = brahmi.DEVANAGARI
GUJARATI = brahmi.GUJARATI
GURMUKHI = brahmi.GURMUKHI
KANNADA = brahmi.KANNADA
MALAYALAM = brahmi.MALAYALAM
ORIYA = brahmi.ORIYA
TAMIL = brahmi.TAMIL
TELUGU = brahmi.TELUGU
# Identifiers of the romanization schemes, re-exported from `schemes.roman`.
# They are used as keys into `SCHEMES`.
HK = roman.HK
IAST = roman.IAST
ITRANS = roman.ITRANS
OPTITRANS = roman.OPTITRANS
KOLKATA = roman.KOLKATA
SLP1 = roman.SLP1
VELTHUIS = roman.VELTHUIS
WX = roman.WX
# Registry of scheme identifier -> scheme object. It starts empty and is
# filled in by `_setup()`.
SCHEMES = {}
class SchemeMap(object):
    """Maps one :class:`Scheme` to another. This class grabs the metadata and
    character data required for :func:`transliterate`.

    Groups that exist in both schemes are paired entry by entry (positionally),
    so the two schemes must list their characters in the same order.

    :param from_scheme: the source scheme
    :param to_scheme: the destination scheme
    """

    # OPTITRANS shortcuts for "nasal + consonant" clusters, as
    # (shortcut, explicit nasal, following consonant), all in OPTITRANS
    # spelling. E.g. "nk" is shorthand for "~N" + virama + "k".
    _OPTITRANS_NASAL_CONJUNCTS = (
        ("nk", "~N", "k"),
        ("nkh", "~N", "kh"),
        ("ng", "~N", "g"),
        ("ngh", "~N", "gh"),
        ("nch", "~n", "ch"),
        ("nCh", "~n", "Ch"),
        ("nj", "~n", "j"),
        ("njh", "~n", "jh"),
    )

    def __init__(self, from_scheme, to_scheme):
        """Create a mapping from `from_scheme` to `to_scheme`."""
        # Source token -> destination token, split by the role of the token.
        self.marks = {}
        self.virama = {}
        self.vowels = {}
        self.consonants = {}
        # Everything that is neither a vowel mark nor the virama (vowels,
        # consonants, symbols, ...).
        self.non_marks_viraama = {}
        self.from_scheme = from_scheme
        self.to_scheme = to_scheme
        # Length of the longest source token; the transliterators use it as
        # their look-ahead window.
        self.max_key_length_from_scheme = max(len(x) for g in from_scheme
                                              for x in from_scheme[g])

        for group in from_scheme.keys():
            if group not in to_scheme.keys():
                continue
            conjunct_map = {}
            for (k, v) in zip(from_scheme[group], to_scheme[group]):
                conjunct_map[k] = v
                # Alternative spellings of a source token map to the same
                # destination token.
                if k in from_scheme.synonym_map:
                    for k_syn in from_scheme.synonym_map[k]:
                        conjunct_map[k_syn] = v
            if group.endswith('marks'):
                self.marks.update(conjunct_map)
            elif group == 'virama':
                self.virama = conjunct_map
            else:
                self.non_marks_viraama.update(conjunct_map)
                if group.endswith('consonants'):
                    self.consonants.update(conjunct_map)
                elif group.endswith('vowels'):
                    self.vowels.update(conjunct_map)

        if from_scheme.name == OPTITRANS:
            # Reading OPTITRANS: expand each shortcut to the explicit
            # nasal + virama + consonant sequence of the destination scheme.
            to_virama_list = to_scheme['virama']
            to_scheme_virama = to_virama_list[0] if to_virama_list else ""
            conjunct_map = {}
            for shortcut, nasal, base in self._OPTITRANS_NASAL_CONJUNCTS:
                conjunct_map[shortcut] = (self.consonants[nasal]
                                          + to_scheme_virama
                                          + self.consonants[base])
            self.consonants.update(conjunct_map)
            self.non_marks_viraama.update(conjunct_map)

            # Also accept the shortcuts spelled with a synonym of the
            # consonant that follows the leading "n".
            synonym_conjunct_map = {}
            for key in conjunct_map.keys():
                latter_consonant = key[1:]
                if latter_consonant in from_scheme.synonym_map:
                    for k_syn in from_scheme.synonym_map[latter_consonant]:
                        synonym_conjunct_map["n" + k_syn] = conjunct_map[key]
            self.consonants.update(synonym_conjunct_map)
            self.non_marks_viraama.update(synonym_conjunct_map)

        if to_scheme.name == OPTITRANS:
            # Writing OPTITRANS: recognise the explicit cluster in the source
            # scheme and emit the shortcut instead.
            inv_map = {v: k for k, v in self.consonants.items()}
            from_virama_list = from_scheme['virama']
            from_scheme_virama = from_virama_list[0] if from_virama_list else ''
            conjunct_map = {}
            for shortcut, nasal, base in self._OPTITRANS_NASAL_CONJUNCTS:
                # A source script that uses one letter for several consonants
                # cannot be inverted for every OPTITRANS consonant; skip the
                # shortcuts that cannot be built instead of raising KeyError.
                if nasal not in inv_map or base not in inv_map:
                    continue
                source_cluster = (inv_map[nasal] + from_scheme_virama
                                  + inv_map[base])
                conjunct_map[source_cluster] = shortcut
            self.consonants.update(conjunct_map)
            self.non_marks_viraama.update(conjunct_map)

    def __str__(self):
        """Return a pretty-printed dump of the main lookup tables."""
        import pprint
        return pprint.pformat({"vowels": self.vowels,
                               "marks": self.marks,
                               "virama": self.virama,
                               "consonants": self.consonants})
def _roman(data, scheme_map, **kw):
"""Transliterate `data` with the given `scheme_map`. This function is used
when the source scheme is a Roman scheme.
:param data: the data to transliterate
:param scheme_map: a dict that maps between characters in the old scheme
and characters in the new scheme
"""
vowels = scheme_map.vowels
marks = scheme_map.marks
virama = scheme_map.virama
consonants = scheme_map.consonants
non_marks_viraama = scheme_map.non_marks_viraama
max_key_length_from_scheme = scheme_map.max_key_length_from_scheme
to_roman = scheme_map.to_scheme.is_roman
togglers = kw.pop('togglers', set())
suspend_on = kw.pop('suspend_on', set())
suspend_off = kw.pop('suspend_off', set())
if kw:
raise TypeError('Unexpected keyword argument %s' % list(kw.keys())[0])
buf = []
i = 0
had_consonant = found = False
len_data = len(data)
append = buf.append
# If true, don't transliterate. The toggle token is discarded.
toggled = False
# If true, don't transliterate. The suspend token is retained.
# `suspended` overrides `toggled`.
suspended = False
while i <= len_data:
# The longest token in the source scheme has length `max_key_length_from_scheme`. Iterate
# over `data` while taking `max_key_length_from_scheme` characters at a time. If we don`t
# find the character group in our scheme map, lop off a character and
# try again.
#
# If we've finished reading through `data`, then `token` will be empty
# and the loop below will be skipped.
token = data[i:i + max_key_length_from_scheme]
while token:
if token in togglers:
toggled = not toggled
i += 2 # skip over the token
found = True # force the token to fill up again
break
if token in suspend_on:
suspended = True
elif token in suspend_off:
suspended = False
if toggled or suspended:
token = token[:-1]
continue
# Catch the pattern CV, where C is a consonant and V is a vowel.
# V should be rendered as a vowel mark, a.k.a. a "dependent"
# vowel. But due to the nature of Brahmic scripts, 'a' is implicit
# and has no vowel mark. If we see 'a', add nothing.
if had_consonant and token in vowels:
mark = marks.get(token, '')
if mark:
append(mark)
elif to_roman:
append(vowels[token])
found = True
# Catch any non_marks_viraama character, including consonants, punctuation,
# and regular vowels. Due to the implicit 'a', we must explicitly
# end any lingering consonants before we can handle the current
# token.
elif token in non_marks_viraama:
if had_consonant:
append(virama[''])
append(non_marks_viraama[token])
found = True
if found:
had_consonant = token in consonants
i += len(token)
break
else:
token = token[:-1]
# We've exhausted the token; this must be some other character. Due to
# the implicit 'a', we must explicitly end any lingering consonants
# before we can handle the current token.
if not found:
if had_consonant:
append(virama[''])
if i < len_data:
append(data[i])
had_consonant = False
i += 1
found = False
return ''.join(buf)
def _brahmic(data, scheme_map, **kw):
"""Transliterate `data` with the given `scheme_map`. This function is used
when the source scheme is a Brahmic scheme.
:param data: the data to transliterate
:param scheme_map: a dict that maps between characters in the old scheme
and characters in the new scheme
"""
marks = scheme_map.marks
virama = scheme_map.virama
consonants = scheme_map.consonants
non_marks_viraama = scheme_map.non_marks_viraama
to_roman = scheme_map.to_scheme.is_roman
max_key_length_from_scheme = scheme_map.max_key_length_from_scheme
buf = []
i = 0
to_roman_had_consonant = found = False
append = buf.append
# logging.debug(pprint.pformat(scheme_map.consonants))
# We dont just translate each brAhmic character one after another in order to prefer concise transliterations when possible - for example ज्ञ -> jn in optitrans rather than j~n.
while i <= len(data):
# The longest token in the source scheme has length `max_key_length_from_scheme`. Iterate
# over `data` while taking `max_key_length_from_scheme` characters at a time. If we don`t
# find the character group in our scheme map, lop off a character and
# try again.
#
# If we've finished reading through `data`, then `token` will be empty
# and the loop below will be skipped.
token = data[i:i + max_key_length_from_scheme]
while token:
if len(token) == 1:
if token in marks:
append(marks[token])
found = True
elif token in virama:
append(virama[token])
found = True
else:
if to_roman_had_consonant:
append('a')
append(non_marks_viraama.get(token, token))
found = True
else:
if token in non_marks_viraama:
if to_roman_had_consonant:
append('a')
append(non_marks_viraama.get(token))
found = True
if found:
to_roman_had_consonant = to_roman and token in consonants
i += len(token)
break
else:
token = token[:-1]
# Continuing the outer while loop.
# We've exhausted the token; this must be some other character. Due to
# the implicit 'a', we must explicitly end any lingering consonants
# before we can handle the current token.
if not found:
if to_roman_had_consonant:
append(next(iter(virama.values())))
if i < len(data):
append(data[i])
to_roman_had_consonant = False
i += 1
found = False
if to_roman_had_consonant:
append('a')
return ''.join(buf)
@lru_cache(maxsize=8)
def _get_scheme_map(input_encoding, output_encoding):
    """Return the `SchemeMap` for a pair of scheme names, reusing cached ones.

    Results are memoized with `lru_cache`, so a repeated request for the same
    pair of names does not build a new `SchemeMap`.

    :param input_encoding: Input encoding. Must be defined in `SCHEMES`.
    :param output_encoding: Output encoding. Must be defined in `SCHEMES`.
    """
    source_scheme = SCHEMES[input_encoding]
    target_scheme = SCHEMES[output_encoding]
    return SchemeMap(source_scheme, target_scheme)
def transliterate(data, _from=None, _to=None, scheme_map=None, **kw):
    """Transliterate `data` with the given parameters::

        output = transliterate('idam adbhutam', HK, DEVANAGARI)

    When no `scheme_map` is given, the :class:`SchemeMap` for
    (`_from`, `_to`) is looked up through a small cache, so repeated calls
    with the same pair of schemes reuse the same map. You can also pass a
    pre-computed :class:`SchemeMap` yourself::

        scheme_map = SchemeMap(SCHEMES[HK], SCHEMES[DEVANAGARI])
        output = transliterate('idam adbhutam', scheme_map=scheme_map)

    :param data: the data to transliterate
    :param _from: the name of the source scheme, a key of `SCHEMES`.
                  Ignored if `scheme_map` is given.
    :param _to: the name of the destination scheme, a key of `SCHEMES`.
                Ignored if `scheme_map` is given.
    :param scheme_map: the :class:`SchemeMap` to use. If specified, ignore
                       `_from` and `_to`. If unspecified, obtain a
                       :class:`SchemeMap` from `_from` to `_to`.
    :param kw: overrides for the default options `togglers` (``{'##'}``),
               `suspend_on` (``{'<'}``) and `suspend_off` (``{'>'}``), which
               are passed on to the transliteration routine
    :return: the transliterated string
    """
    if scheme_map is None:
        scheme_map = _get_scheme_map(_from, _to)
    options = {
        'togglers': {'##'},
        'suspend_on': {'<'},
        'suspend_off': {'>'},
    }
    options.update(kw)

    # Roman and Brahmic sources need different tokenising strategies.
    func = _roman if scheme_map.from_scheme.is_roman else _brahmic
    return func(data, scheme_map, **options)
def _setup():
    """Add a variety of default schemes.

    Registers the Roman schemes from `roman` and the Brahmic schemes defined
    below in the global `SCHEMES` dictionary.

    The character lists of a group are matched up position by position when
    two schemes are mapped onto each other, so every scheme must list the
    same characters in the same order. Empty strings mark characters a
    script has no sign for.
    """
    s = str.split
    if sys.version_info < (3, 0):
        # On Python 2 the literals below are `unicode` objects (because of
        # `unicode_literals`), so the matching unbound method is needed.
        # noinspection PyUnresolvedReferences
        s = unicode.split

    ## NOTE: See the Scheme constructor documentation for a few general notes while defining schemes.
    SCHEMES.update({
        HK: roman.HkScheme(),
        VELTHUIS: roman.VelthiusScheme(),
        OPTITRANS: roman.OptitransScheme(),
        ITRANS: roman.ItransScheme(),
        IAST: roman.IastScheme(),
        KOLKATA: roman.IastScheme(kolkata_variant=True),
        SLP1: roman.Slp1Scheme(),
        WX: roman.WxScheme(),
        BENGALI: Scheme({
            'vowels': s("""অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ এ ঐ ও ঔ"""),
            'marks': s("""া ি ী ু ূ ৃ ৄ ৢ ৣ ে ৈ ো ৌ"""),
            'virama': s('্'),
            'yogavaahas': s('ং ঃ ঁ'),
            'consonants': s("""
                            ক খ গ ঘ ঙ
                            চ ছ জ ঝ ঞ
                            ট ঠ ড ঢ ণ
                            ত থ দ ধ ন
                            প ফ ব ভ ম
                            য র ল ব
                            শ ষ স হ
                            ळ ক্ষ জ্ঞ
                            """),
            'symbols': s("""
                         ॐ ঽ । ॥
                         ০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯
                         """)
        }, is_roman=False, name=BENGALI),
        DEVANAGARI: Scheme({
            'vowels': s("""अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ए ऐ ओ औ"""),
            'marks': s("""ा ि ी ु ू ृ ॄ ॢ ॣ े ै ो ौ"""),
            'virama': s('्'),
            'yogavaahas': s('ं ः ँ'),
            'consonants': s("""
                            क ख ग घ ङ
                            च छ ज झ ञ
                            ट ठ ड ढ ण
                            त थ द ध न
                            प फ ब भ म
                            य र ल व
                            श ष स ह
                            ळ क्ष ज्ञ
                            """),
            'symbols': s("""
                         ॐ ऽ । ॥
                         ० १ २ ३ ४ ५ ६ ७ ८ ९
                         """)
        }, is_roman=False, name=DEVANAGARI),
        GUJARATI: Scheme({
            'vowels': s("""અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ એ ઐ ઓ ઔ"""),
            'marks': s("""ા િ ી ુ ૂ ૃ ૄ ૢ ૣ ે ૈ ો ૌ"""),
            'virama': s('્'),
            'yogavaahas': s('ં ઃ ઁ'),
            'consonants': s("""
                            ક ખ ગ ઘ ઙ
                            ચ છ જ ઝ ઞ
                            ટ ઠ ડ ઢ ણ
                            ત થ દ ધ ન
                            પ ફ બ ભ મ
                            ય ર લ વ
                            શ ષ સ હ
                            ળ ક્ષ જ્ઞ
                            """),
            # The danda and double danda must be listed here as in every
            # other scheme; without them the digits would be paired with
            # the wrong entries of the other scheme.
            'symbols': s("""
                         ૐ ઽ । ॥
                         ૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯
                         """)
        }, is_roman=False, name=GUJARATI),
        GURMUKHI: Scheme({
            'vowels': s("""ਅ ਆ ਇ ਈ ਉ ਊ ऋ ॠ ऌ ॡ ਏ ਐ ਓ ਔ"""),
            'marks': ['ਾ', 'ਿ', 'ੀ', 'ੁ', 'ੂ', '', '',
                      '', '', 'ੇ', 'ੈ', 'ੋ', 'ੌ'],
            'virama': s('੍'),
            'yogavaahas': s('ਂ ਃ ਁ'),
            'consonants': s("""
                            ਕ ਖ ਗ ਘ ਙ
                            ਚ ਛ ਜ ਝ ਞ
                            ਟ ਠ ਡ ਢ ਣ
                            ਤ ਥ ਦ ਧ ਨ
                            ਪ ਫ ਬ ਭ ਮ
                            ਯ ਰ ਲ ਵ
                            ਸ਼ ਸ਼ ਸ ਹ
                            ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ
                            """),
            'symbols': s("""
                         ੴ ఽ । ॥
                         ੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯
                         """)
        }, is_roman=False, name=GURMUKHI),
        KANNADA: Scheme({
            'vowels': s("""ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಏ ಐ ಓ ಔ"""),
            'marks': s("""ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೇ ೈ ೋ ೌ"""),
            'virama': s('್'),
            'yogavaahas': s('ಂ ಃ ँ'),
            'consonants': s("""
                            ಕ ಖ ಗ ಘ ಙ
                            ಚ ಛ ಜ ಝ ಞ
                            ಟ ಠ ಡ ಢ ಣ
                            ತ ಥ ದ ಧ ನ
                            ಪ ಫ ಬ ಭ ಮ
                            ಯ ರ ಲ ವ
                            ಶ ಷ ಸ ಹ
                            ಳ ಕ್ಷ ಜ್ಞ
                            """),
            'symbols': s("""
                         ಓಂ ऽ । ॥
                         ೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯
                         """)
        }, is_roman=False, name=KANNADA),
        MALAYALAM: Scheme({
            'vowels': s("""അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ ഏ ഐ ഓ ഔ"""),
            'marks': s("""ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ േ ൈ ോ ൌ"""),
            'virama': s('്'),
            'yogavaahas': s('ം ഃ ँ'),
            'consonants': s("""
                            ക ഖ ഗ ഘ ങ
                            ച ഛ ജ ഝ ഞ
                            ട ഠ ഡ ഢ ണ
                            ത ഥ ദ ധ ന
                            പ ഫ ബ ഭ മ
                            യ ര ല വ
                            ശ ഷ സ ഹ
                            ള ക്ഷ ജ്ഞ
                            """),
            'symbols': s("""
                         ഓം ഽ । ॥
                         ൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯
                         """)
        }, is_roman=False, name=MALAYALAM),
        ORIYA: Scheme({
            'vowels': s("""ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ ଏ ଐ ଓ ଔ"""),
            'marks': ['ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'ୄ',
                      '', '', 'େ', 'ୈ', 'ୋ', 'ୌ'],
            'virama': s('୍'),
            'yogavaahas': s('ଂ ଃ ଁ'),
            'consonants': s("""
                            କ ଖ ଗ ଘ ଙ
                            ଚ ଛ ଜ ଝ ଞ
                            ଟ ଠ ଡ ଢ ଣ
                            ତ ଥ ଦ ଧ ନ
                            ପ ଫ ବ ଭ ମ
                            ଯ ର ଲ ଵ
                            ଶ ଷ ସ ହ
                            ଳ କ୍ଷ ଜ୍ଞ
                            """),
            'symbols': s("""
                         ଓଂ ଽ । ॥
                         ୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯
                         """)
        }, is_roman=False, name=ORIYA),
        TAMIL: Scheme({
            'vowels': s("""அ ஆ இ ஈ உ ஊ ऋ ॠ ऌ ॡ ஏ ஐ ஓ ஔ"""),
            'marks': ['ா', 'ி', 'ீ', 'ு', 'ூ', '', '',
                      '', '', 'ே', 'ை', 'ோ', 'ௌ'],
            'virama': s('்'),
            'yogavaahas': s('ஂ ஃ ँ'),
            'consonants': s("""
                            க க க க ங
                            ச ச ஜ ச ஞ
                            ட ட ட ட ண
                            த த த த ந
                            ப ப ப ப ம
                            ய ர ல வ
                            ஶ ஷ ஸ ஹ
                            ள க்ஷ ஜ்ஞ
                            """),
            'symbols': s("""
                         ௐ ऽ । ॥
                         ௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯
                         """)
        }, is_roman=False, name=TAMIL),
        TELUGU: Scheme({
            'vowels': s("""అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఏ ఐ ఓ ఔ"""),
            'marks': s("""ా ి ీ ు ూ ృ ౄ ౢ ౣ ే ై ో ౌ"""),
            'virama': s('్'),
            'yogavaahas': s('ం ః ఁ'),
            'consonants': s("""
                            క ఖ గ ఘ ఙ
                            చ ఛ జ ఝ ఞ
                            ట ఠ డ ఢ ణ
                            త థ ద ధ న
                            ప ఫ బ భ మ
                            య ర ల వ
                            శ ష స హ
                            ళ క్ష జ్ఞ
                            """),
            'symbols': s("""
                         ఓం ఽ । ॥
                         ౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯
                         """)
        }, is_roman=False, name=TELUGU)
    })
# Populate `SCHEMES` with the default schemes as soon as the module is
# imported.
_setup()