"""Morphological analysis service: Morfeusz 2 analysis disambiguated by Concraft.

The public entry point is :func:`analyse`, which returns one plain ``dict``
per disambiguated interpretation.
"""

from typing import Any, Dict, List, Optional, Sequence

import morfeusz2

from conmorfeusz import concraft

# Maps a tag's leading component (stored under "type" by ``parse_tags``) to
# the ordered names of the colon-separated values that follow it.  Types that
# are not listed here are treated as carrying no further fields.
_TAG_SPECS: Dict[str, List[str]] = {
    "adv": ["degree"],
    "imps": ["aspect"],
    "inf": ["aspect"],
    "pant": ["aspect"],
    "pcon": ["aspect"],
    "qub": ["vocalicity"],
    "prep": ["case", "vocalicity"],
    "siebie": ["case"],
    "subst": ["number", "case", "gender"],
    "depr": ["number", "case", "gender"],
    "ger": ["number", "case", "gender", "aspect", "negation"],
    "ppron12": ["number", "case", "gender", "person", "accentability"],
    "ppron3": [
        "number",
        "case",
        "gender",
        "person",
        "accentability",
        "postprepositionality",
    ],
    "num": ["number", "case", "gender", "accommodability"],
    "numcol": ["number", "case", "gender", "accommodability"],
    "adj": ["number", "case", "gender", "degree"],
    "pact": ["number", "case", "gender", "aspect", "negation"],
    "ppas": ["number", "case", "gender", "aspect", "negation"],
    "winien": ["number", "gender", "aspect"],
    "praet": ["number", "gender", "aspect", "agglutination"],
    "bedzie": ["number", "person", "aspect"],
    "fin": ["number", "person", "aspect"],
    "impt": ["number", "person", "aspect"],
    "aglt": ["number", "person", "aspect", "vocalicity"],
}


def parse_tags(tags: str) -> Dict[str, Optional[str]]:
    """Split a colon-separated tag string into named fields.

    The first component is stored under ``"type"``.  The remaining components
    are paired, in order, with the field names ``_TAG_SPECS`` lists for that
    type.

    Args:
        tags: Tag string such as ``"subst:sg:nom:m1"``.

    Returns:
        A dict with ``"type"`` plus one key per field listed for that type.
        Fields for which the tag supplies no value are ``None``.  Values
        beyond the listed fields (including every value of a type missing
        from ``_TAG_SPECS``) are dropped.
    """
    type_, *values = tags.split(":")
    fields = _TAG_SPECS.get(type_, [])

    # Pad with None so that every expected field is present in the result
    # even when the tag is shorter than its spec.
    missing = max(len(fields) - len(values), 0)
    padded: List[Optional[str]] = values + [None] * missing

    # zip() stops at the shorter input, which discards surplus values.
    return {"type": type_, **dict(zip(fields, padded))}


def entry_to_dict(entry: Sequence[Any]) -> Dict[str, Any]:
    """Convert one disambiguated entry into a flat dict.

    Args:
        entry: A 6-item sequence ``(start, end, morf, prob, eos, disamb)``
            where ``morf`` is itself a 5-item sequence
            ``(segment, lemma, tags, frequency, qualifiers)``.

    Returns:
        A dict with the keys ``start``, ``end``, ``segment``, ``lemma``,
        ``tags`` (parsed by ``parse_tags``), ``frequency``, ``qualifiers``,
        ``prob`` (as ``float``), ``eos`` and ``disamb`` (as ``bool``).

    Raises:
        ValueError: If ``entry`` or its third item does not have the
            expected number of items (raised by tuple unpacking).
    """
    start, end, interpretation, prob, eos, disamb = entry
    # NOTE(review): the Morfeusz 2 documentation appears to describe the
    # fourth item as name-classification labels rather than a frequency; the
    # "frequency" key is kept for compatibility with callers -- confirm.
    segment, lemma, tags, frequency, qualifiers = interpretation

    return {
        "start": start,
        "end": end,
        "segment": segment,
        "lemma": lemma,
        "tags": parse_tags(tags),
        "frequency": frequency,
        "qualifiers": qualifiers,
        "prob": float(prob),
        # The markers are compared against literal strings; any other value
        # yields False.
        "eos": eos == "eos",
        "disamb": disamb == "disamb",
    }


def analyse(text: str) -> List[Dict[str, Any]]:
    """Analyse ``text`` with Morfeusz and disambiguate it with Concraft.

    Args:
        text: The text to analyse.

    Returns:
        One dict per disambiguated entry, in the order Concraft returns
        them, in the shape produced by ``entry_to_dict``.  Entries that do
        not have exactly six items are skipped.
    """
    # A fresh analyser and a fresh Concraft client are created on every call.
    morf = morfeusz2.Morfeusz(expand_tags=True)
    conc = concraft.Concraft()

    analysis = morf.analyse(text)
    return [entry_to_dict(e) for e in conc.disamb(analysis) if len(e) == 6]