# -*- coding: utf-8 -*-
"""
sanskrit.analyze
~~~~~~~~~~~~~~~~
Code for *analyzing* Sanskrit forms, i.e. finding the basic lexical
forms that produced them and specifying the word's inflectional
information.
:license: MIT
"""
from collections import defaultdict, namedtuple
from . import sounds, util
from .schema import *
# Immutable record describing one nominal ending as stored in the analyzer's
# ending trie:
#   name              -- the ending's text
#   length            -- number of characters in the ending (len(name) when
#                        the record is built), used to strip it from a word
#   stem_type         -- copied from the NominalEnding row; appended to the
#                        truncated word to rebuild a non-consonant stem
#   gender_id, case_id, number_id, compounded
#                     -- copied unchanged from the NominalEnding row
#   is_consonant_stem -- True when stem_type equals
#                        NominalEnding.CONSONANT_STEM_TYPE
Ending = namedtuple(
    'Ending',
    'name length stem_type gender_id case_id number_id compounded '
    'is_consonant_stem',
)
class Analyzer(object):

    """Abstract interface for analyzers.

    Both methods raise :exc:`NotImplementedError`, so this class cannot be
    instantiated or used directly; a subclass overrides ``__init__`` and
    ``analyze`` with a real implementation.
    """

    def __init__(self):
        # Deliberately not callable: subclasses supply their own constructor
        # and must not chain up to this one.
        raise NotImplementedError

    def analyze(self, token):
        """Analyze a single token.

        :param token: the token to analyze
        :raises NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError
class SimpleAnalyzer(Analyzer):

    """A simple analyzer for Sanskrit words. The analyzer is simple
    for a few reasons:

    - It doesn't do any caching.
    - It uses an ORM instead of raw SQL queries.
    - Its output is always "well-formed." For example, neuter nouns can
      take only neuter endings.

    This analyzer is best used when memory is at a premium and speed is
    a secondary concern (e.g. when on a web server).
    """

    def __init__(self, ctx):
        """Load every nominal ending from the database into a lookup trie.

        :param ctx: context object. This class reads ``ctx.session`` (used
                    for all queries) and, during stem analysis,
                    ``ctx.gender_set``.
        """
        self.ctx = ctx
        self.session = ctx.session

        # Endings are stored under their *reversed* text, and looked up
        # later with the reversed word.
        # NOTE(review): presumably ``util.HashTrie`` returns every entry
        # whose key is a prefix of the lookup key, which would make this a
        # suffix match on the original word -- confirm against ``util``.
        self.nominal_endings = util.HashTrie()

        for e in self.session.query(NominalEnding):
            ending = Ending(
                name=e.name,
                length=len(e.name),
                stem_type=e.stem_type,
                gender_id=e.gender_id,
                case_id=e.case_id,
                number_id=e.number_id,
                compounded=e.compounded,
                is_consonant_stem=(
                    e.stem_type == NominalEnding.CONSONANT_STEM_TYPE),
            )
            self.nominal_endings[e.name[::-1]] = ending

            if 'n' in e.name:
                # Also register the variant with every 'n' replaced by 'R'
                # (presumably the retroflex nasal of the transliteration
                # scheme -- verify). The replacement is one character for
                # one character, so ``length`` stays valid.
                # TODO: do this more rigorously
                retroflexed = e.name.replace('n', 'R')
                self.nominal_endings[retroflexed[::-1]] = \
                    ending._replace(name=retroflexed)

        # Release the session now that the endings are held in memory.
        self.session.remove()

    def _analyze_as_form(self, word):
        """Analyze a word by searching for an exact match in the database.

        :param word: the word to analyze
        :returns: a list of all ``Form`` rows whose ``name`` equals `word`
        """
        return self.session.query(Form).filter(Form.name == word).all()

    def _analyze_as_stem(self, word):
        """Analyze a word by searching for the nominal stems that might
        have produced it.

        :param word: the word to analyze
        :returns: a list of newly constructed ``Nominal`` objects, one per
                  (existing stem, compatible ending) pair; empty if no
                  candidate stem could be formed or none exists.
        """
        # Find all stems that could produce this word. Some of these
        # stems might not exist.
        stem_endings_map = defaultdict(set)
        for e in self.nominal_endings[word[::-1]]:
            # ``word[:-e.length]`` is empty both for a zero-length ending
            # (``word[:-0]``) and for an ending that spans the whole word;
            # in either case fall back to the whole word.
            # NOTE(review): because of this fallback the "stem must exist"
            # guard below only fires for an empty `word` -- confirm that
            # treating a whole-word ending this way is intended.
            truncated_stem = word[:-e.length] or word

            if e.is_consonant_stem:
                # Stem must exist and end in a consonant.
                if not truncated_stem:
                    continue
                if truncated_stem[-1] in sounds.VOWELS:
                    continue
                # The stem must also not be one of the entries of
                # ``sounds.CONSONANTS`` itself.
                if truncated_stem in sounds.CONSONANTS:
                    continue
                stem = truncated_stem
            else:
                # Other stems are rebuilt by re-appending the stem type.
                stem = truncated_stem + e.stem_type

            stem_endings_map[stem].add(e)

        if not stem_endings_map:
            return []

        # Check which of these stems are viable. The candidate names are
        # materialised as a list rather than handing ``in_`` a dict view.
        candidate_names = list(stem_endings_map)
        stems = self.session.query(Stem) \
                    .filter(Stem.name.in_(candidate_names))

        gender_set = self.ctx.gender_set
        returned = []

        # Reattach endings to viable stems
        for stem in stems:
            candidates = stem_endings_map[stem.name]

            # For nouns, disregard endings that don't match the stem's
            # genders.
            # TODO: fix semantics of this
            if stem.pos_id == Tag.NOMINAL:
                stem_genders = gender_set[stem.genders_id]
                candidates = [e for e in candidates
                              if e.gender_id in stem_genders]

            for e in candidates:
                returned.append(Nominal(
                    name=word,
                    pos_id=stem.pos_id,
                    stem=stem,
                    gender_id=e.gender_id,
                    case_id=e.case_id,
                    number_id=e.number_id,
                    compounded=e.compounded,
                ))

        return returned

    def analyze(self, word):
        """Return all possible solutions for the given word.

        Exact ``Form`` matches come first, followed by the ``Nominal``
        objects produced by stem analysis.

        :param word: the word to analyze. This should be a complete
                     word, or what Panini would call a *pada*.
        :returns: a list of solutions (possibly empty)
        """
        # NOTE(review): earlier documentation stated that any ORM objects in
        # the result are in a detached state; nothing in this method
        # detaches them -- confirm whether the session setup guarantees it.
        returned = self._analyze_as_form(word)
        returned.extend(self._analyze_as_stem(word))
        return returned