# Source code for indic_transliteration.detect
# -*- coding: utf-8 -*-
"""
detect
~~~~~~
Code for automatically detecting a transliteration scheme.
:license: MIT and BSD
Example usage:
::

    from indic_transliteration import detect
    detect.detect('pitRRIn') == detect.Scheme.ITRANS
    detect.detect('pitRRn') == detect.Scheme.HK
When handling a Sanskrit string, it's almost always best to explicitly
state its transliteration scheme. This avoids embarrassing errors with
words like ``pitRRIn``. But most of the time, it's possible to infer the
encoding from the text itself.
``detect.py`` automatically detects a string's transliteration scheme:
::
detect('pitRRIn') == Scheme.ITRANS
detect('pitRRn') == Scheme.HK
detect('pitFn') == Scheme.SLP1
detect('पितॄन्') == Scheme.Devanagari
detect('পিতৄন্') == Scheme.Bengali
Supported schemes
-----------------
All schemes are attributes on the ``Scheme`` class. You can also just
use the scheme name:
::
Scheme.IAST == 'IAST'
Scheme.Devanagari == 'Devanagari'
Scripts:
- Bengali (``'Bengali'``)
- Devanagari (``'Devanagari'``)
- Gujarati (``'Gujarati'``)
- Gurmukhi (``'Gurmukhi'``)
- Kannada (``'Kannada'``)
- Malayalam (``'Malayalam'``)
- Oriya (``'Oriya'``)
- Tamil (``'Tamil'``)
- Telugu (``'Telugu'``)
Romanizations:
- Harvard-Kyoto (``'HK'``)
- IAST (``'IAST'``)
- ITRANS (``'ITRANS'``)
- Kolkata (``'Kolkata'``)
- SLP1 (``'SLP1'``)
- Velthuis (``'Velthuis'``)
"""
import re
#: Scheme data. This is split into separate classes, but here it's DRY.
import sys
#: Every supported scheme as a ``(name, code point)`` pair. Brahmic scripts
#: carry the first code point of their Unicode block; romanizations have no
#: block of their own and carry ``None``.
SCHEMES = [
    # Brahmic scripts
    ('Bengali', 0x0980),
    ('Devanagari', 0x0900),
    ('Gujarati', 0x0a80),
    ('Gurmukhi', 0x0a00),
    ('Kannada', 0x0c80),
    ('Malayalam', 0x0d00),
    ('Oriya', 0x0b00),
    ('Tamil', 0x0b80),
    ('Telugu', 0x0c00),
    # Romanizations
    ('HK', None),
    ('IAST', None),
    ('ITRANS', None),
    ('Kolkata', None),
    ('SLP1', None),
    ('Velthuis', None),
]

#: Lowest code point treated as Brahmic: the start of the Devanagari block.
BRAHMIC_FIRST_CODE_POINT = 0x0900

#: Highest code point treated as Brahmic: the end of the Malayalam block.
BRAHMIC_LAST_CODE_POINT = 0x0d7f

#: The schemes that define a code point, ordered from the highest starting
#: code point down to the lowest. Schemes without one are left out.
BLOCKS = sorted(
    (entry for entry in SCHEMES if entry[-1]),
    key=lambda entry: entry[1],
    reverse=True,
)

#: Enum-like class for Sanskrit schemes: each scheme name is an attribute
#: whose value is that same name as a string.
Scheme = type('Enum', (), dict((name, name) for name, _ in SCHEMES))
class Regex:
    """Compiled patterns used to tell the romanization schemes apart.

    Every attribute is a compiled regular expression meant to be used with
    ``search``: a hit means the text contains a character or character
    sequence that the pattern's name associates with particular schemes.

    Backslashes in the patterns are written as ``\\\\`` because the literals
    are not raw strings; a lone backslash before ``^`` would be an invalid
    string escape sequence.
    """

    #: Match on special Roman characters (letters with diacritics)
    IAST_OR_KOLKATA_ONLY = re.compile(u'[āīūṛṝḷḹēōṃḥṅñṭḍṇśṣḻ]')

    #: Match on chars shared by ITRANS and Velthuis
    ITRANS_OR_VELTHUIS_ONLY = re.compile(u'aa|ii|uu|~n')

    #: Match on ITRANS-only
    ITRANS_ONLY = re.compile(u'ee|oo|\\^[iI]|RR[iI]|L[iI]|'
                             u'~N|N\\^|Ch|chh|JN|sh|Sh|\\.a')

    #: Match on Kolkata-specific Roman characters
    KOLKATA_ONLY = re.compile(u'[ēō]')

    #: Match on SLP1-only characters and bigrams. The last alternative
    #: matches a ``G`` at the start of the text or after a non-word character.
    SLP1_ONLY = re.compile(u'[fFxXEOCYwWqQPB]|kz|Nk|Ng|tT|dD|Sc|Sn|'
                           u'[aAiIuUfFxXeEoO]R|'
                           u'G[yr]|(\\W|^)G')

    #: Match on Velthuis-only characters
    VELTHUIS_ONLY = re.compile(u'\\.[mhnrltds]|"n|~s')
# noinspection PyUnresolvedReferences
def detect(text):
    """Detect the input's transliteration scheme.

    The first character that falls inside the Brahmic code point range
    decides the result. Otherwise the romanization patterns are tried in a
    fixed order: IAST/Kolkata diacritics, ITRANS-only sequences, SLP1-only
    sequences, Velthuis-only sequences, then sequences shared by ITRANS and
    Velthuis (reported as ITRANS).

    :param text: some text data, either text (``unicode`` on Python 2,
                 ``str`` on Python 3) or a byte string encoded in UTF-8.
    :return: the name of the detected scheme, equal to one of the
             attributes of ``Scheme``. Text with no distinguishing feature,
             including empty text, is reported as ``Scheme.HK``.
    """
    # ``bytes`` is ``str`` on Python 2, so this covers encoded input on both
    # major versions while leaving already-decoded text untouched.
    if isinstance(text, bytes):
        try:
            text = text.decode('utf-8')
        except UnicodeError:
            # Not valid UTF-8. Latin-1 maps every byte to the code point of
            # the same value, so this decode cannot fail and yields only
            # code points below the Brahmic range.
            text = text.decode('latin-1')

    # Brahmic schemes are all within a specific range of code points.
    for char in text:
        code = ord(char)
        if BRAHMIC_FIRST_CODE_POINT <= code <= BRAHMIC_LAST_CODE_POINT:
            # BLOCKS is ordered from the highest start code point down, so
            # the first block starting at or below ``code`` is the match.
            for name, start_code in BLOCKS:
                if start_code <= code:
                    return name

    # Romanizations
    if Regex.IAST_OR_KOLKATA_ONLY.search(text):
        if Regex.KOLKATA_ONLY.search(text):
            return Scheme.Kolkata
        return Scheme.IAST

    if Regex.ITRANS_ONLY.search(text):
        return Scheme.ITRANS

    if Regex.SLP1_ONLY.search(text):
        return Scheme.SLP1

    if Regex.VELTHUIS_ONLY.search(text):
        return Scheme.Velthuis

    # Sequences shared by ITRANS and Velthuis are reported as ITRANS.
    if Regex.ITRANS_OR_VELTHUIS_ONLY.search(text):
        return Scheme.ITRANS

    # Nothing distinctive was found.
    return Scheme.HK