Source code for sanskrit.tagger

"""
    sanskrit.tagger
    ~~~~~~~~~~~~~~~
    Code for converting Sanskrit paragraphs into a list of linguistic
    forms. This is done through a *part-of-speech tagger*
    (:class:`~sanskrit.tagger.Tagger`) that also removes sandhi and
    identifies the lexical roots that underlie the forms in some passage.

    When tagging, a block of text (i.e. a verse or paragraph) is called
    a *segment* and its space-separated substrings are called *chunks*.
    For example, the segment ``'aTa SabdAnuSAsanam |'`` has chunks
    ``'aTa'``, ``'SabdAnuSAsanam'``, and ``'|'``.

    :license: MIT
"""

from sanskrit import analyze, models, sandhi, schema, util


class NonForm:
    """Wraps a chunk that couldn't be parsed by the :class:`Tagger`.

    :param name: the raw text of the unparsed chunk
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # `!r` lets Python pick the quoting, so a name that itself contains
        # an apostrophe still produces a well-formed, unambiguous repr.
        return 'NonForm({!r})'.format(self.name)
class TaggedItem:
    """Associates a linguistic form with a specific chunk and segment.

    :param segment_id: identifier of the segment the form belongs to
    :param chunk_index: index of the chunk the form was found in
    :param form: the linguistic form itself, or a :class:`NonForm` when the
                 chunk could not be parsed
    """

    def __init__(self, segment_id, chunk_index, form):
        self.segment_id = segment_id
        self.chunk_index = chunk_index
        self.form = form

    def __repr__(self):
        fields = (self.segment_id, self.chunk_index, self.form.name)
        return 'TaggedItem({})'.format(fields)

    def _enum_string(self, ctx, fields):
        """Join the abbreviations of the form's enum-valued `fields`.

        For every field, the form's ``<field>_id`` attribute is looked up in
        ``ctx.enum_abbr[field]``. A field whose id is ``None`` contributes an
        empty string, so the result always has one dash-separated slot per
        entry in `fields`.

        :param ctx: an object exposing an ``enum_abbr`` mapping
        :param fields: a list of field names, without the ``_id`` suffix
        :return: a ``'-'``-joined string of abbreviations
        """
        enums = ctx.enum_abbr
        strings = []
        for field in fields:
            enum_id = getattr(self.form, field + '_id')
            strings.append('' if enum_id is None else enums[field][enum_id])
        return '-'.join(strings)

    def tag(self, ctx):
        """Return the part-of-speech tag for the wrapped form.

        :param ctx: passed through to :meth:`_enum_string`
        :return: ``models.SEQUENCE_BOUNDARY`` for a :class:`NonForm`;
                 otherwise a ``'-'``-joined string naming the form type,
                 followed by its enum abbreviations for verbs and nominals
        :raises TypeError: if the form is of an unrecognized type
        """
        form = self.form
        if isinstance(form, NonForm):
            return models.SEQUENCE_BOUNDARY

        if isinstance(form, schema.Indeclinable):
            tup = ('indeclinable',)
        elif isinstance(form, schema.Verb):
            tup = ('verb', self._enum_string(ctx, ['person', 'number']))
        elif isinstance(form, schema.Nominal):
            tup = ('nominal',
                   self._enum_string(ctx, ['gender', 'case', 'number']))
        elif isinstance(form, schema.Infinitive):
            tup = ('infinitive',)
        elif isinstance(form, schema.Gerund):
            tup = ('gerund',)
        elif isinstance(form, schema.PerfectIndeclinable):
            tup = ('perfect-indeclinable',)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `tup` when a new form type reaches this method.
            raise TypeError(
                'cannot tag form of type {}'.format(type(form).__name__))
        return '-'.join(tup)

    def human_readable_form(self, ctx):
        """Describe the wrapped form as a 4-tuple of strings.

        :param ctx: passed through to :meth:`_enum_string`
        :return: ``(name, form type, root or stem name, enum abbreviations)``.
                 Slots that don't apply to the form type are empty strings.
                 ``None`` is returned if the form is of an unrecognized type.
        """
        form = self.form
        if isinstance(form, NonForm):
            return (form.name, '', '', '')
        elif isinstance(form, schema.Indeclinable):
            return (form.name, 'indeclinable', '', '')
        elif isinstance(form, schema.Verb):
            return (form.name, 'verb', form.root.name,
                    self._enum_string(ctx, ['vclass', 'person', 'number',
                                            'mode', 'voice']))
        elif isinstance(form, schema.Nominal):
            return (form.name, 'nominal', form.stem.name,
                    self._enum_string(ctx, ['gender', 'case', 'number']))
        elif isinstance(form, schema.Infinitive):
            return (form.name, 'infinitive', form.root.name, '')
        elif isinstance(form, schema.Gerund):
            return (form.name, 'gerund', form.root.name, '')
        elif isinstance(form, schema.PerfectIndeclinable):
            return (form.name, 'perfect-indeclinable', form.root.name, '')
        # Unrecognized form types have no description.
        return None
class Tagger:
    """The part-of-speech tagger.

    :param ctx: a context object. Its ``session`` is queried for the sandhi
                rules, and the object itself is handed to the analyzer.
    """

    def __init__(self, ctx):
        rules = [(x.first, x.second, x.result)
                 for x in ctx.session.query(schema.SandhiRule).all()]

        self.ctx = ctx
        self.splitter = sandhi.Splitter(rules)
        self.analyzer = analyze.SimpleAnalyzer(ctx)
        self.model = models.FeatureModel()

    def _score(self, before, cur, remainder):
        """Compute a score over the given tagger state.

        :param before: the items tagged so far (currently unused)
        :param cur: the :class:`TaggedItem` under consideration
        :param remainder: the part of the chunk that is still unparsed
        """
        # NOTE: alternative scoring that conditions on the previous tag,
        # kept for reference:
        # xs = [before[-1].tag(self.ctx)] if before else []
        # y = cur.tag(self.ctx)
        # return self.model.log_cond_prob(xs, y)
        return self.model.score(cur, remainder)

    def iter_chunks(self, segment):
        """Iterate over the chunks in `segment`.

        :param segment: an arbitrary string
        """
        for line in segment.splitlines():
            for chunk in line.split():
                yield chunk

    def tag(self, segment, segment_id=None):
        """Return the linguistic forms that compose `segment`.

        If a form can't be parsed, it's wrapped in :class:`NonForm`.

        The search keeps states of the form ``(done, chunk_index,
        remainder)`` in a priority queue: the items tagged so far, the index
        of the chunk being processed, and the unparsed rest of that chunk.
        Each popped state is expanded with every sandhi split of its
        remainder whose first part the analyzer recognizes.

        :param segment: an arbitrary string
        :param segment_id: stored on every returned item
        :return: a list of :class:`TaggedItem` objects. The list is empty
                 if `segment` contains no chunks.
        """
        chunks = list(self.iter_chunks(segment))
        if not chunks:
            # Honor the documented return type for blank input.
            return []

        q = util.PriorityQueue()
        q.push(([], 0, chunks[0]), 0)

        done = []
        while q:
            (done, chunk_index, remainder), priority = q.pop_with_priority()

            # Chunk is done
            if not remainder:
                if chunk_index + 1 < len(chunks):
                    new_state = (done, chunk_index + 1,
                                 chunks[chunk_index + 1])
                    q.push(new_state, priority)
                    continue
                else:
                    # Segment is done!
                    break

            for before, after in self.splitter.iter_splits(remainder):
                # Without this line, the tagger could loop forever. This
                # looping occurs if a sandhi rule has the form "X -> Y X",
                # which yields Y while leaving the term with X unchanged.
                if remainder == after:
                    continue

                results = self.analyzer.analyze(before)
                for result in results:
                    item = TaggedItem(segment_id, chunk_index, result)
                    q.push((done + [item], chunk_index, after),
                           priority + self._score(done, item, after))

            # Add "default" state in case nothing could be found. This is
            # only done while the chunk is still untouched, so the whole
            # chunk becomes a single NonForm.
            if remainder == chunks[chunk_index]:
                result = NonForm(remainder)
                item = TaggedItem(segment_id, chunk_index, result)
                # A remainder of None marks the chunk as finished.
                new_state = (done + [item], chunk_index, None)
                q.push(new_state,
                       priority + self._score(done, item, remainder))

        return done