Source code for indic_transliteration.deduplication

# -*- coding: utf-8 -*-
"""
Some useful functions for converting and disambiguating between common alternative orthographies (ways of writing) the same text.
"""

import logging

# Not using the more standard library re here : We need to support `key = re.sub("\\P{IsDevanagari}", "", key)`.
import regex
from indic_transliteration import sanscript
from indic_transliteration.sanscript.schemes import roman 


[docs]def fix_lazy_anusvaara_itrans(data_in):
    return roman.ItransScheme.fix_lazy_anusvaara(data_in=data_in)

[docs]def get_approx_deduplicating_key(text, encoding_scheme=sanscript.DEVANAGARI):
    """
    Given some devanAgarI sanskrit text, this function produces a "key" so
    that
  
    1] The key should be the same for different observed orthographical
    forms of the same text. For example:
  
    ::
  
        - "dharmma" vs "dharma"
        - "rAmaM gacChati" vs "rAma~N gacChati" vs "rAma~N gacChati"
        - "kurvan eva" vs "kurvanneva"
  
    2] The key should be different for different for different texts.
  
    -  "stamba" vs "stambha"
  
    This function attempts to succeed at [1] and [2] almostall the time.
    Longer the text, probability of failing at [2] decreases, while
    probability of failing at [1] increases (albeit very slightly).
  
    Sources of orthographically divergent forms:
  
    -  Phonetically sensible grammar rules
    -  Neglect of sandhi while writing
    -  Punctuation, spaces, avagraha-s.
    -  Regional-language-influenced mistakes (La instead of la.)
  
    Some example applications of this function:
  
    -  Create a database of quotes or words with minimal duplication.
    -  Search a database of quotes or words while being robust to optional
       forms.
  
    Also see equivalent function in the scala indic-transliteration package.
    """
    if encoding_scheme == sanscript.DEVANAGARI:
        key = text
        key = regex.sub("\\P{IsDevanagari}", "", key)
        # Remove spaces
        key = regex.sub("\\s", "", key)
        # Remove punctuations
        key = regex.sub("\\p{P}", "", key)
        # Remove digits, abbreviation sign, svara-s, avagraha
        key = regex.sub("[०-९।॥॰ऽ]|[॑-॔]", "", key)
        # Collapse semi-vowel-anunAsika-s संलग्नम् सल्ँलग्नम् into m
        key = regex.sub("[यरल]्ँ", "म्", key)
        # Collapse all panchama-s into m
        key = regex.sub("[ङञणन]", "म", key)
        # Collapse anusvAra into m
        key = regex.sub("ँ|ं", "म्", key)
        key = regex.sub("ॐ", "ओम्", key)
        key = regex.sub("[ळऴ]", "ल", key)
        # Deal with optional forms where consonants are duplicated - like dharmma
        # Details in https://docs.google.com/spreadsheets/d/1GP8Ps_hmgCGLZPWKIVBCfQB9ZmPQOaCwTrH9OybaWaQ/edit#gid=21
        key = regex.sub("([क-हक़-य़])्\\1+", "\\1", key)
        key = regex.sub("[कग]्ख्", "ख्", key)
        key = regex.sub("[कग]्घ्", "घ्", key)
        key = regex.sub("च्छ्", "छ्", key)
        key = regex.sub("ज्झ्", "झ्", key)
        key = regex.sub("त्थ्", "थ्", key)
        key = regex.sub("द्ध्", "ध्", key)
        key = regex.sub("ड्ढ्", "ढ्", key)
        key = regex.sub("प्फ्", "फ्", key)
        key = regex.sub("ब्भ्", "भ्", key)
        return key
    else:
        logging.warning("got script {} for '{}'".format(encoding_scheme, text))
        return regex.sub("\\s", "", text)
Source code for indic_transliteration.deduplication

indic_transliteration

Navigation

Related Topics