Source code for sanskrit.sandhi

# -*- coding: utf-8 -*-
"""
    sanskrit.sandhi
    ~~~~~~~~~~~~~~~

    Classes that apply and undo sandhi rules.

    :license: MIT
"""

from . import sounds
from .util import HashTrie


[docs]class Exempt(unicode):

    """A helper class for marking strings as exempt from sandhi changes. To
    mark a string as exempt, just do the following::

        original = 'amI'
        exempt = Exempt('amI')

    :class:`Exempt` is a subclass of :class:`unicode`, so you can use normal
    string methods on :class:`Exempt` objects.
    """


class SandhiObject(object):

    def add_rules(self, rules):
        raise NotImplementedError


[docs]class Joiner(SandhiObject):

    """Joins multiple Sanskrit terms by applying sandhi rules."""

    def __init__(self, rules=None):
        self.data = {}
        if rules:
            self.add_rules(rules)

[docs]    def add_rules(self, rules):
        """Add rules for joining words.

        Example usage::

            joiner.add_rules[('a', 'i', 'e'), ('a', 'a', 'A'])

        :param rules: a list of 3-tuples, each of which contains:

        - the first part of the combination
        - the second part of the combination
        - the result

        """
        self.data = {}
        for first, second, result in rules:
            self.data[(first, second)] = result

    @staticmethod
[docs]    def internal_retroflex(term):
        """Apply the "n -> ṇ" and "s -> ṣ" rules of internal sandhi.

        :param term: the string to process
        """
        # causes "s" retroflexion
        s_trigger = set('iIuUfFeEoOkr')
        # causes "n" retroflexion
        n_trigger = set('fFrz')
        # Allowed after n_trigger
        n_between = sounds.VOWELS.union('kKgGNpPbBmhvyM')
        # Must appear after the retroflexed "n"
        n_after = sounds.VOWELS.union('myvn')
        # Defines t retroflexion
        retroflexion_dict = dict(zip('tT', 'wW'))

        letters = list(term)

        apply_s = False
        apply_n = False
        had_n = False  # Used for double retroflexion ('nisanna' -> 'nizaRRa')
        had_s = False  # Used for 't' retroflexion
        for i, L in enumerate(letters[:-1]):
            # "t" retroflexion after "s" retroflexion
            if had_s:
                had_s = False
                letters[i] = retroflexion_dict.get(L, L)

            # "s" retroflexion
            if apply_s and L == 's':
                letters[i] = L = 'z'
                had_s = True
            apply_s = L in s_trigger

            # "n" retroflexion
            if had_n and L == 'n':
                letters[i] = 'R'
                had_n = False
            elif apply_n and L == 'n' and letters[i + 1] in n_after:
                letters[i] = 'R'
                had_n = True
            if L in n_trigger:
                apply_n = True
            else:
                apply_n = apply_n and L in n_between

        return ''.join(letters)

[docs]    def join(self, chunks, internal=False):
        """Join the given chunks according to the object's rules::

            assert 'tasyecCA' == s.join('tasya', 'icCA')

        :meth:`join` does **not** take pragṛhya rules into account. As a
        reminder, the main exception are:

            | 1.1.11 "ī", "ū", and "e" when they end words in the dual.
            | 1.1.12 the same vowels after the "m" of adas;
            | 1.1.13 particles with just one vowel, apart from "ā"
            | 1.1.14 particles that end in "o".

        One simple way to account for these rules is to wrap exempt strings
        with :class:`Exempt`::

            assert joiner.join('te', 'iti') == 'ta iti'
            assert joiner.join(Exempt('te'), 'iti') == 'te iti'

        :param chunks: a list of the strings that should be joined
        :param internal: if true, join words using the empty string instead of
                         `' '`.
        """
        separator = '' if internal else ' '

        it = iter(chunks)
        returned = next(it)
        for chunk in it:
            if not chunk:
                continue
            if isinstance(returned, Exempt):
                returned += separator + chunk
            else:
                # `i` controls the number of letters to grab from the end of
                # the first word. For most rules, one letter is sufficient.
                # But visarga sandhi needs slightly more context.
                for i in (2, 1, 0):
                    if not i:
                        returned += separator + chunk
                        break
                    key = (returned[-i:], chunk[0])
                    result = self.data.get(key, None)
                    if result:
                        returned = returned[:-i] + result + chunk[1:]
                        break
            if isinstance(chunk, Exempt):
                returned = Exempt(returned)

        if internal:
            return Joiner.internal_retroflex(returned)
        else:
            return returned


[docs]class Splitter(object):

    """Splits Sanskrit terms by undoing sandhi rules."""

    def __init__(self, rules=None):
        """"""
        self.data = HashTrie()
        if rules:
            self.add_rules(rules)

    def add_rules(self, rules):
        for first, second, result in rules:
            result = result.replace(' ', '')
            items = (first, second, result, len(first), len(second),
                     len(result))
            self.data[result] = items

[docs]    def iter_splits(self, chunk):
        """Return a generator for all splits in `chunk`. Results are yielded
        as 2-tuples containing the term before the split and the term after::

            for item in s.splits('nareti'):
                before, after = item

        :meth:`splits` will generate many false positives, usually when the
        first part of the split ends in an invalid consonant::

            assert ('narAv', 'iti') in s.splits('narAviti')

        These should be filtered out in the calling function.

        Splits are generated from left to right, but the function makes no
        guarantees on when certain rules are applied. That is, output is
        loosely ordered but nondeterministic.
        """

        chunk_len = len(chunk)

        for i in xrange(chunk_len):
            # Default split: chop the chunk in half with no other changes.
            # This can yield a lot of false positives.
            chunk1, chunk2 = chunk[:i], chunk[i:]
            if i:
                yield (chunk1, chunk2)

            # Rule-based splits: undo a sandhi change
            rules = self.data[chunk2]
            for first, second, result, _, _, len_result in rules:
                before = chunk1 + first
                after = second + chunk2[len_result:]
                yield (before, after)

        # Non-split: yield the chunk as-is.
        yield (chunk, '')