Add support for parsing output
This commit is contained in:
@@ -1,8 +1,70 @@
|
||||
import morfeusz2
|
||||
from conmorfeusz import concraft
|
||||
from typing import Dict, List
|
||||
|
||||
# Maps the leading type of a colon-separated tag string to the ordered names
# of the values that follow it; parse_tags() pairs the names with the values
# positionally, so the order inside each list is significant.
# NOTE(review): the keys look like the Morfeusz/NKJP grammatical classes --
# confirm against the tagset documentation before extending.
_TAG_SPECS: Dict[str, List[str]] = {
    "adv": ["degree"],
    "imps": ["aspect"],
    "inf": ["aspect"],
    "pant": ["aspect"],
    "pcon": ["aspect"],
    "qub": ["vocalicity"],
    "prep": ["case", "vocalicity"],
    "siebie": ["case"],
    "subst": ["number", "case", "gender"],
    "depr": ["number", "case", "gender"],
    "ger": ["number", "case", "gender", "aspect", "negation"],
    "ppron12": ["number", "case", "gender", "person", "accentability"],
    "ppron3": [
        "number",
        "case",
        "gender",
        "person",
        "accentability",
        "postprepositionality",
    ],
    "num": ["number", "case", "gender", "accommodability"],
    "numcol": ["number", "case", "gender", "accommodability"],
    "adj": ["number", "case", "gender", "degree"],
    "pact": ["number", "case", "gender", "aspect", "negation"],
    "ppas": ["number", "case", "gender", "aspect", "negation"],
    "winien": ["number", "gender", "aspect"],
    "praet": ["number", "gender", "aspect", "agglutination"],
    "bedzie": ["number", "person", "aspect"],
    "fin": ["number", "person", "aspect"],
    "impt": ["number", "person", "aspect"],
    "aglt": ["number", "person", "aspect", "vocalicity"],
}
|
||||
|
||||
|
||||
def parse_tags(tags: str) -> Dict[str, str | None]:
    """Split a colon-separated tag string into a dict of named fields.

    The first component is stored under ``"type"`` and selects the list of
    field names from ``_TAG_SPECS``; the remaining components are assigned to
    those names in order.

    Args:
        tags: Tag string whose components are separated by ``":"``.

    Returns:
        A dict with ``"type"`` plus one key per field listed for that type.
        Fields with no corresponding component are ``None``. A type that is
        not in ``_TAG_SPECS`` yields only the ``"type"`` key.
    """
    type_, *values = tags.split(":")
    fields = _TAG_SPECS.get(type_, [])
    # Pad with None so every field in the spec gets a key. A negative repeat
    # count yields an empty list, and zip() stops at the end of ``fields``,
    # so components beyond the spec are dropped.
    padded = values + [None] * (len(fields) - len(values))
    return {"type": type_, **dict(zip(fields, padded))}
|
||||
|
||||
def entry_to_dict(entry):
    """Convert one disambiguated analysis entry into a flat dict.

    Args:
        entry: A 6-item sequence unpacked as
            ``(start, end, morf, prob, eos, disamb)``, where ``morf`` is
            itself a 5-item sequence unpacked as
            ``(segment, lemma, tags, frequency, qualifiers)``.

    Returns:
        A dict with the keys ``start``, ``end``, ``segment``, ``lemma``,
        ``tags`` (parsed by ``parse_tags``), ``frequency``, ``qualifiers``,
        ``prob`` (converted with ``float``), and the booleans ``eos`` and
        ``disamb``.

    Raises:
        ValueError: If ``entry`` or its ``morf`` item does not have the
            expected number of items, or ``prob`` is not float-convertible.
    """
    start, end, morf, prob, eos, disamb = entry
    segment, lemma, tags, frequency, qualifiers = morf

    return {
        "start": start,
        "end": end,
        "segment": segment,
        "lemma": lemma,
        "tags": parse_tags(tags),
        "frequency": frequency,
        "qualifiers": qualifiers,
        "prob": float(prob),
        # The two markers are compared against literal strings; any other
        # value (including an empty or missing marker) maps to False.
        "eos": eos == "eos",
        "disamb": disamb == "disamb",
    }
|
||||
|
||||
|
||||
def analyse(text):
    """Analyse *text* and return the disambiguated entries as dicts.

    The text is analysed with ``morfeusz2.Morfeusz(expand_tags=True)``, the
    analysis is passed to ``concraft.Concraft().disamb``, and each resulting
    entry is converted with ``entry_to_dict``.

    Args:
        text: The text to analyse.

    Returns:
        A list of dicts as produced by ``entry_to_dict``, one per 6-item
        entry returned by the disambiguator.
    """
    morf = morfeusz2.Morfeusz(expand_tags=True)
    conc = concraft.Concraft()
    analysis = morf.analyse(text)
    # The earlier ``return conc.disamb(analysis)`` was removed: it made the
    # parsed return below unreachable.
    # Entries that do not have exactly six items cannot be unpacked by
    # entry_to_dict and are skipped.
    return [entry_to_dict(e) for e in conc.disamb(analysis) if len(e) == 6]
|
||||
Reference in New Issue
Block a user