Add tags parsing

This commit is contained in:
2026-01-07 21:21:31 +01:00
parent 80ea2ba07f
commit 34b1fbd648

View File

@@ -6,8 +6,7 @@ async function analyse(input) {
const linePattern = /^\s?(?<opening>\[)?(?<start>\d+),(?<end>\d+),(?<segment>,|(.+?)),(?<lemma>,|(.+?)),(?<tags>[\w:.]+),(?<frequency>_|(.+?)),(?<qualifiers>_|(.+?))(?<ending>\])?$/;
const output = [];
let segment = undefined;
let inSegment = false;
let segment = undefined;
for (const line of stdout.toString().split(/\r?\n|\r/)) {
if (line.trim().length === 0) {
@@ -27,7 +26,7 @@ async function analyse(input) {
segment = [];
}
segment.push(data);
segment.push({ ...data, tags: parseTags(data.tags) });
if (ending) {
inSegment = false;
@@ -63,7 +62,8 @@ async function invokeConcraftRemotely(dag) {
async function desambiguate(analysis) {
const input = analysis
.flatMap(a => a)
.map(a => ({ ...a, tags: a.tags.split(":").map(t => t.split(".")[0]).join(":") }))
.map(a => ({ ...a, tags: Object.keys(a.tags).map(key => key === 'case' ? a.tags[key].split(".")[0] : a.tags[key]).join(":") }))
// .map(a => ({ ...a, tags: a.tags.split(":").map(t => t.split(".")[0]).join(":") }))
.map(a => `${[...Object.values(a), "0.0", "_", "_", "_"].join("\t")}`)
.join("\n");
@@ -84,7 +84,8 @@ async function desambiguate(analysis) {
element.end = Number.parseInt(element.end);
element.disamb = element.disamb === 'disamb';
element.eos = element.eos === 'eos';
element.prob = Number.parseFloat(element.prob);
element.prob = Number.parseFloat(element.prob);
element.tags = parseTags(element.tags);
output.push(element);
}
@@ -137,6 +138,42 @@ function generate(input, ...tags) {
return output.flatMap(a => a).filter(entry => tags.every(tag => entry.tags.includes(tag)));
}
/**
 * Parses a colon-separated morphosyntactic tag string into an object.
 *
 * The first segment is the tag type (for example `subst`, `adv`, `praet`);
 * the remaining segments are positional attribute values whose meaning
 * depends on the type. Attribute values are kept verbatim, so an ambiguous
 * value such as `nom.acc` is returned as the string `"nom.acc"`.
 *
 * Keys are inserted in positional order (`type` first), so iterating the
 * result with `Object.keys`/`Object.values` yields the segments in the same
 * order as in the input string. Attributes that are absent from the input
 * (optional trailing positions) are omitted from the result instead of being
 * stored as `undefined`.
 *
 * @param {string} tags - Tag string such as `"subst:sg:nom:m1"`.
 * @returns {{ type: string, [attribute: string]: string }} The tag type plus
 *   the attributes recognised for that type; an unknown type yields only
 *   `{ type }`.
 * @throws {TypeError} If `tags` has no `split` method (e.g. is not a string).
 */
function parseTags(tags) {
  const [type, ...values] = tags.split(":");

  // Positional attribute names per tag type. A Map is used instead of a plain
  // object so that a type named like an inherited Object member
  // (e.g. "constructor") is treated as unknown rather than resolved through
  // the prototype chain.
  const attributeNames = new Map([
    ["adv", ["degree"]],
    ["imps", ["aspect"]],
    ["inf", ["aspect"]],
    ["pant", ["aspect"]],
    ["pcon", ["aspect"]],
    ["qub", ["vocalicity"]],
    ["prep", ["case", "vocalicity"]],
    ["siebie", ["case"]],
    ["subst", ["number", "case", "gender"]],
    ["depr", ["number", "case", "gender"]],
    ["ger", ["number", "case", "gender", "aspect", "negation"]],
    ["ppron12", ["number", "case", "gender", "person", "accentability"]],
    ["ppron3", ["number", "case", "gender", "person", "accentability", "postprepositionality"]],
    ["num", ["number", "case", "gender", "accommodability"]],
    ["numcol", ["number", "case", "gender", "accommodability"]],
    ["adj", ["number", "case", "gender", "degree"]],
    ["pact", ["number", "case", "gender", "aspect", "negation"]],
    ["ppas", ["number", "case", "gender", "aspect", "negation"]],
    ["winien", ["number", "gender", "aspect"]],
    ["praet", ["number", "gender", "aspect", "agglutination"]],
    ["bedzie", ["number", "person", "aspect"]],
    ["fin", ["number", "person", "aspect"]],
    ["impt", ["number", "person", "aspect"]],
    ["aglt", ["number", "person", "aspect", "vocalicity"]],
  ]);

  const parsed = { type };
  const names = attributeNames.get(type) ?? [];
  names.forEach((name, position) => {
    const value = values[position];
    // Skip only positions missing from the input; empty strings are kept
    // because they were explicitly present between two colons.
    if (value !== undefined) {
      parsed[name] = value;
    }
  });
  return parsed;
}
async function main([ interpreter, script, action, ...args ]) {
const fn = {
analyse,