Source code for sanskrit.analyze

# -*- coding: utf-8 -*-
"""
    sanskrit.analyze
    ~~~~~~~~~~~~~~~~

    Code for *analyzing* Sanskrit forms, i.e. finding the basic lexical
    forms that produced them and specifying the word's inflectional
    information.

    :license: MIT
"""

from collections import defaultdict, namedtuple

from . import sounds, util
from .schema import *


Ending = namedtuple('Ending', ['name', 'length', 'stem_type', 'gender_id',
                               'case_id', 'number_id', 'compounded',
                               'is_consonant_stem'])


class Analyzer(object):

    """analyzer"""

    def __init__(self):
        raise NotImplementedError

    def analyze(self, token):
        raise NotImplementedError


[docs]class SimpleAnalyzer(Analyzer): """A simple analyzer for Sanskrit words. The analyzer is simple for a few reasons: - It doesn't do any caching. - It uses an ORM instead of raw SQL queries. - Its output is always "well-formed." For example, neuter nouns can take only neuter endings. This analyzer is best used when memory is at a premium and speed is a secondary concern (e.g. when on a web server). """ def __init__(self, ctx): self.ctx = ctx self.session = ctx.session self.nominal_endings = util.HashTrie() for e in self.session.query(NominalEnding): stem_type = e.stem_type is_cons = (stem_type == NominalEnding.CONSONANT_STEM_TYPE) data = { 'name': e.name, 'stem_type': e.stem_type, 'length': len(e.name), 'gender_id': e.gender_id, 'case_id': e.case_id, 'number_id': e.number_id, 'compounded': e.compounded, 'is_consonant_stem': is_cons, } self.nominal_endings[e.name[::-1]] = Ending(**data) if 'n' in e.name: # TODO: do this more rigorously reversed_name = e.name.replace('n', 'R') data['name'] = reversed_name self.nominal_endings[reversed_name[::-1]] = Ending(**data) self.session.remove() def _analyze_as_form(self, word): """ Analyze a word by searching for an exact match in the database. :param word: the word to analyze """ session = self.session results = session.query(Form).filter(Form.name == word).all() return results def _analyze_as_stem(self, word): """ Analyze a word by searching for the nominal stems that might have produced it. :param word: the word to analyze """ session = self.session gender_set = self.ctx.gender_set returned = [] # Find all stems that could produce this word. Some of these # stems might not exist. stem_endings_map = defaultdict(set) endings = self.nominal_endings[word[::-1]] for e in endings: truncated_stem = word[:-e.length] or word if e.is_consonant_stem: # Stem must exist and end in a consonant. if not truncated_stem: continue if truncated_stem[-1] in sounds.VOWELS: continue if truncated_stem in sounds.CONSONANTS: continue stem = truncated_stem else: stem = truncated_stem + e.stem_type stem_endings_map[stem].add(e) if not stem_endings_map: return [] # Check which of these stems are viable stems = session.query(Stem) \ .filter(Stem.name.in_(stem_endings_map.keys())) # Reattach endings to viable stems for stem in stems: name = stem.name # For nouns, disregard endings that don't match the stem's # genders. # TODO: fix semantics of this if stem.pos_id == Tag.NOMINAL: stem_genders = gender_set[stem.genders_id] endings = (e for e in stem_endings_map[name] if e.gender_id in stem_genders) else: endings = stem_endings_map[name] for e in endings: datum = { 'name': word, 'pos_id': stem.pos_id, 'stem': stem, 'gender_id': e.gender_id, 'case_id': e.case_id, 'number_id': e.number_id, 'compounded': e.compounded, } returned.append(Nominal(**datum)) return returned
[docs] def analyze(self, word): """Return all possible solutions for the given word. Any ORM objects used in these solutions will be in a detached state. :param word: the word to analyze. This should be a complete word, or what Panini would call a *pada*. """ returned = self._analyze_as_form(word) returned.extend(self._analyze_as_stem(word)) return returned