Source code for sanskrit.sandhi

# -*- coding: utf-8 -*-
"""
    sanskrit.sandhi
    ~~~~~~~~~~~~~~~

    Classes that apply and undo sandhi rules.

    :license: MIT
"""

from . import sounds
from .util import HashTrie


try:
    _text_type = unicode
except NameError:
    # Python 3 has no separate `unicode` builtin; `str` is the text type.
    _text_type = str


class Exempt(_text_type):

    """A helper class for marking strings as exempt from sandhi changes.

    To mark a string as exempt, just do the following::

        original = 'amI'
        exempt = Exempt('amI')

    :class:`Exempt` is a subclass of the text type (:class:`unicode` on
    Python 2, :class:`str` on Python 3), so you can use normal string
    methods on :class:`Exempt` objects. Note that the built-in string
    operations return plain strings, not :class:`Exempt` instances.
    """
class SandhiObject(object):

    """Base class that declares the :meth:`add_rules` interface."""

    def add_rules(self, rules):
        """Register `rules` with this object.

        The base class provides no implementation; calling it always
        raises :class:`NotImplementedError`.

        :param rules: the rules to add
        """
        raise NotImplementedError()
class Joiner(SandhiObject):

    """Joins multiple Sanskrit terms by applying sandhi rules.

    Rules are kept in ``self.data``, a dict that maps a
    ``(first, second)`` pair to the string that replaces it.
    """

    def __init__(self, rules=None):
        """Create a joiner, optionally preloaded with `rules`.

        :param rules: an optional list of 3-tuples; see :meth:`add_rules`.
        """
        self.data = {}
        if rules:
            self.add_rules(rules)

    def add_rules(self, rules):
        """Add rules for joining words. Example usage::

            joiner.add_rules([('a', 'i', 'e'), ('a', 'a', 'A')])

        Rules accumulate across calls. If a ``(first, second)`` pair is
        added more than once, the most recent result wins.

        :param rules: a list of 3-tuples, each of which contains:

                      - the first part of the combination
                      - the second part of the combination
                      - the result
        """
        # Do not reset `self.data` here: this method *adds* rules, and
        # `__init__` has already created the dict.
        for first, second, result in rules:
            self.data[(first, second)] = result

    @staticmethod
    def internal_retroflex(term):
        """Apply the "n -> ṇ" and "s -> ṣ" rules of internal sandhi.

        The term is scanned from left to right. The last letter is never
        changed itself; it is only used as right-hand context for the
        letter before it.

        :param term: the string to process
        :returns: the processed string
        """
        # causes "s" retroflexion
        s_trigger = set('iIuUfFeEoOkr')
        # causes "n" retroflexion
        n_trigger = set('fFrz')
        # Allowed after n_trigger
        n_between = sounds.VOWELS.union('kKgGNpPbBmhvyM')
        # Must appear after the retroflexed "n"
        n_after = sounds.VOWELS.union('myvn')
        # Defines t retroflexion
        retroflexion_dict = dict(zip('tT', 'wW'))

        letters = list(term)
        apply_s = False
        apply_n = False
        had_n = False  # Used for double retroflexion ('nisanna' -> 'nizaRRa')
        had_s = False  # Used for 't' retroflexion

        for i, L in enumerate(letters[:-1]):
            # "t" retroflexion after "s" retroflexion
            if had_s:
                had_s = False
                letters[i] = retroflexion_dict.get(L, L)

            # "s" retroflexion. `L` is rebound to 'z' as well, so the new
            # sound can itself trigger "n" retroflexion below.
            if apply_s and L == 's':
                letters[i] = L = 'z'
                had_s = True
            apply_s = L in s_trigger

            # "n" retroflexion
            if had_n and L == 'n':
                letters[i] = 'R'
                had_n = False
            elif apply_n and L == 'n' and letters[i + 1] in n_after:
                letters[i] = 'R'
                had_n = True

            # A trigger (re)starts the "n" context; any letter outside
            # `n_between` cancels it.
            if L in n_trigger:
                apply_n = True
            else:
                apply_n = apply_n and L in n_between

        return ''.join(letters)

    def join(self, chunks, internal=False):
        """Join the given chunks according to the object's rules::

            assert 'tasyecCA' == s.join(['tasya', 'icCA'])

        :meth:`join` does **not** take pragṛhya rules into account. As a
        reminder, the main exceptions are:

        | 1.1.11 "ī", "ū", and "e" when they end words in the dual.
        | 1.1.12 the same vowels after the "m" of adas;
        | 1.1.13 particles with just one vowel, apart from "ā"
        | 1.1.14 particles that end in "o".

        One simple way to account for these rules is to wrap exempt strings
        with :class:`Exempt`::

            assert joiner.join(['te', 'iti']) == 'ta iti'
            assert joiner.join([Exempt('te'), 'iti']) == 'te iti'

        Empty chunks are ignored. If there is nothing to join, the empty
        string is returned.

        :param chunks: a list of the strings that should be joined
        :param internal: if true, join words using the empty string instead
                         of `' '`.
        :returns: the joined string
        """
        separator = '' if internal else ' '

        # Drop empty chunks up front so that an empty *first* chunk is
        # treated like any other empty chunk instead of producing a
        # leading separator.
        it = (chunk for chunk in chunks if chunk)
        try:
            returned = next(it)
        except StopIteration:
            # Nothing to join. Do not let StopIteration escape: inside a
            # calling generator it would end iteration silently or become
            # a RuntimeError (PEP 479).
            return ''

        for chunk in it:
            if isinstance(returned, Exempt):
                # The text so far is exempt: concatenate without sandhi.
                # String addition yields a plain string, which drops the
                # exemption for the following chunk.
                returned += separator + chunk
            else:
                # `i` controls the number of letters to grab from the end of
                # the first word. For most rules, one letter is sufficient.
                # But visarga sandhi needs slightly more context.
                for i in (2, 1, 0):
                    if not i:
                        # No rule matched: plain concatenation.
                        returned += separator + chunk
                        break
                    key = (returned[-i:], chunk[0])
                    result = self.data.get(key, None)
                    if result:
                        returned = returned[:-i] + result + chunk[1:]
                        break
            if isinstance(chunk, Exempt):
                # The joined text now ends in an exempt chunk, so protect
                # it from the next join step.
                returned = Exempt(returned)

        if internal:
            return Joiner.internal_retroflex(returned)
        else:
            return returned
class Splitter(object):

    """Splits Sanskrit terms by undoing sandhi rules."""

    def __init__(self, rules=None):
        """Create a splitter, optionally preloaded with `rules`.

        :param rules: an optional list of 3-tuples; see :meth:`add_rules`.
        """
        self.data = HashTrie()
        if rules:
            self.add_rules(rules)

    def add_rules(self, rules):
        """Add rules for splitting words.

        Each rule is stored under its `result` string with any spaces
        removed. The stored value is the tuple ``(first, second, result,
        len(first), len(second), len(result))``.

        :param rules: a list of 3-tuples, each of which contains:

                      - the first part of the combination
                      - the second part of the combination
                      - the result
        """
        for first, second, result in rules:
            # Stored results never contain spaces, so strip them here.
            result = result.replace(' ', '')
            items = (first, second, result,
                     len(first), len(second), len(result))
            self.data[result] = items

    def iter_splits(self, chunk):
        """Return a generator for all splits in `chunk`.

        Results are yielded as 2-tuples containing the term before the split
        and the term after::

            for item in s.iter_splits('nareti'):
                before, after = item

        :meth:`iter_splits` will generate many false positives, usually when
        the first part of the split ends in an invalid consonant::

            assert ('narAv', 'iti') in s.iter_splits('narAviti')

        These should be filtered out in the calling function.

        Splits are generated from left to right, but the function makes no
        guarantees on when certain rules are applied. That is, output is
        loosely ordered but nondeterministic.

        :param chunk: the string to split
        """
        # `range` rather than `xrange` so this also runs on Python 3.
        for i in range(len(chunk)):
            # Default split: chop the chunk in half with no other changes.
            # This can yield a lot of false positives.
            chunk1, chunk2 = chunk[:i], chunk[i:]
            if i:
                yield (chunk1, chunk2)

            # Rule-based splits: undo a sandhi change.
            # NOTE(review): this relies on `self.data[chunk2]` returning the
            # stored rule tuples whose `result` starts `chunk2`; the lookup
            # semantics live in `HashTrie` and are not visible here.
            rules = self.data[chunk2]
            for first, second, result, _, _, len_result in rules:
                before = chunk1 + first
                after = second + chunk2[len_result:]
                yield (before, after)

        # Non-split: yield the chunk as-is.
        yield (chunk, '')