From 04eee534b7d218b311ced917f1fa2bd007b755eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Pluta?=
Date: Wed, 7 Jan 2026 16:01:09 +0100
Subject: [PATCH] Add conmorfeusz as synergy of morfeusz and concraft-pl

---
Review notes: the line structure of this patch was reconstructed. The author
e-mail, the indentation of the flake.nix context lines and the index.js blob
hash could not be recovered; regenerate the patch with `git format-patch`
before applying it.

 conmorfeusz.nix |  38 +++++++++
 flake.lock      |  61 +++++++++++++
 flake.nix       |   1 +
 index.js        | 221 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+)
 create mode 100644 conmorfeusz.nix
 create mode 100644 flake.lock
 create mode 100755 index.js

diff --git a/conmorfeusz.nix b/conmorfeusz.nix
new file mode 100644
index 0000000..31252e0
--- /dev/null
+++ b/conmorfeusz.nix
@@ -0,0 +1,38 @@
+{
+  self,
+  stdenv,
+  lib,
+  writers,
+  nodejs,
+  fetchurl,
+  model ? "",
+  serverUrl ? "",
+  local ? true,
+  ...
+}: let
+  sgjpModel = fetchurl {
+    url = "https://zil.ipipan.waw.pl/Concraft?action=AttachFile&do=get&target=concraft-pl-model-SGJP-20220221.gz";
+    hash = "sha256-VcvdSkJwUhAgHroA0d/bH3QDjjO/2x8HqSuUvRgIN/4=";
+  };
+
+  targetModel =
+    if model == ""
+    then sgjpModel
+    else model;
+in
+  writers.makeScriptWriter {interpreter = "${nodejs}/bin/node";} "conmorfeusz" ''
+    // Passed from Nix derivation config
+    const MORFEUSZ_ANALYSER_BIN="${self.packages.${stdenv.hostPlatform.system}.morfeusz}/bin/morfeusz_analyzer";
+    const MORFEUSZ_GENERATOR_BIN="${self.packages.${stdenv.hostPlatform.system}.morfeusz}/bin/morfeusz_generator";
+    const CONCRAFT_BIN="${self.packages.${stdenv.hostPlatform.system}.concraft-pl}/bin/concraft-pl";
+    const CONCRAFT_MODEL="${targetModel}";
+    const CONCRAFT_SERVER_URL="${serverUrl}";
+    const CONCRAFT_MODE="${
+      if local
+      then "local"
+      else "remote"
+    }";
+    // =================================
+
+    ${builtins.readFile ./index.js}
+  ''
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..8fbe5c2
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,61 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1767640445,
+        "narHash": "sha256-UWYqmD7JFBEDBHWYcqE6s6c77pWdcU/i+bwD6XxMb8A=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "9f0c42f8bc7151b8e7e5840fb3bd454ad850d8c5",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
index 7883566..008b06f 100644
--- a/flake.nix
+++ b/flake.nix
@@ -17,6 +17,7 @@
       packages = rec {
         morfeusz = pkgs.callPackage ./morfeusz.nix {};
         concraft-pl = pkgs.callPackage ./concraft-pl.nix {};
+        conmorfeusz = pkgs.callPackage ./conmorfeusz.nix {inherit self;};
       };
     })
     // {
diff --git a/index.js b/index.js
new file mode 100755
index 0000000..792626d
--- /dev/null
+++ b/index.js
@@ -0,0 +1,221 @@
+// MORFEUSZ_ANALYSER_BIN, MORFEUSZ_GENERATOR_BIN, CONCRAFT_BIN, CONCRAFT_MODEL,
+// CONCRAFT_SERVER_URL and CONCRAFT_MODE are not defined in this file:
+// conmorfeusz.nix emits them in front of it when building the script.
+const { spawnSync } = require("child_process");
+
+// One line of morfeusz_analyzer output: comma-separated fields, optionally
+// preceded by "[" and/or followed by "]".
+// NOTE(review): apart from opening/tags/ending, which the code reads by name,
+// the group names mirror the column names in desambiguate() - confirm.
+const ANALYSER_LINE_PATTERN = /^\s?(?<opening>\[)?(?<start>\d+),(?<end>\d+),(?<segment>,|(.+?)),(?<lemma>,|(.+?)),(?<tags>[\w:.]+),(?<frequency>_|(.+?)),(?<qualifiers>_|(.+?))(?<ending>\])?$/;
+
+// Same layout for morfeusz_generator, minus the two leading numeric fields.
+const GENERATOR_LINE_PATTERN = /^\s?(?<opening>\[)?(?<segment>,|(.+?)),(?<lemma>,|(.+?)),(?<tags>[\w:.]+),(?<frequency>_|(.+?)),(?<qualifiers>_|(.+?))(?<ending>\])?$/;
+
+/**
+ * Runs an external tool synchronously, feeding `input` to its stdin.
+ *
+ * @param {string} bin - Path of the executable.
+ * @param {string[]} args - Command-line arguments.
+ * @param {string} input - Text written to the tool's stdin.
+ * @returns {string} Everything the tool wrote to stdout.
+ * @throws {Error} If the tool cannot be run or exits with a non-zero status.
+ */
+function runTool(bin, args, input) {
+  const { error, status, stdout, stderr } = spawnSync(bin, args, { input });
+
+  if (error) {
+    throw new Error(`Failed to run ${bin}`, { cause: error });
+  }
+
+  if (status !== 0) {
+    throw new Error(`${bin} exited with status ${status}: ${stderr}`);
+  }
+
+  return stdout.toString();
+}
+
+/**
+ * Groups the lines of Morfeusz output into segments.
+ *
+ * A line whose `opening` group matched starts a new segment and a line whose
+ * `ending` group matched closes it; the remaining named groups of each line
+ * become one entry of the current segment.
+ *
+ * @param {string} text - Raw tool output.
+ * @param {RegExp} linePattern - Pattern with `opening` and `ending` groups.
+ * @returns {object[][]} One array of entries per segment.
+ * @throws {Error} If a non-blank line does not match `linePattern` or is not
+ *   inside a segment.
+ */
+function parseSegments(text, linePattern) {
+  const output = [];
+  let segment;
+
+  for (const line of text.split(/\r?\n|\r/)) {
+    if (line.trim().length === 0) {
+      continue;
+    }
+
+    const match = line.match(linePattern);
+
+    if (!match) {
+      throw new Error(`Following line does not match the pattern: ${line}`);
+    }
+
+    const { opening, ending, ...data } = match.groups;
+
+    if (opening) {
+      segment = [];
+    }
+
+    if (!segment) {
+      throw new Error(`Following line is outside of any segment: ${line}`);
+    }
+
+    segment.push(data);
+
+    if (ending) {
+      output.push(segment);
+      segment = undefined;
+    }
+  }
+
+  return output;
+}
+
+/**
+ * Runs morfeusz_analyzer on `input` and parses its output.
+ *
+ * @param {string} input - Text written to the analyser's stdin.
+ * @returns {Promise<object[][]>} Parsed entries grouped per segment.
+ */
+async function analyse(input) {
+  return parseSegments(runTool(MORFEUSZ_ANALYSER_BIN, [], input), ANALYSER_LINE_PATTERN);
+}
+
+/**
+ * Tags `input` with the local concraft-pl binary and the configured model.
+ *
+ * @param {string} input - Tab-separated rows built by desambiguate().
+ * @returns {Promise<string>} Raw concraft-pl output.
+ */
+async function runConcraftLocally(input) {
+  return runTool(CONCRAFT_BIN, ["tag", CONCRAFT_MODEL], input);
+}
+
+/**
+ * Posts `dag` as JSON to the `/parse` endpoint of CONCRAFT_SERVER_URL.
+ *
+ * @param {string} dag - Tab-separated rows built by desambiguate().
+ * @returns {Promise<string>} The `dag` field of the JSON response;
+ *   desambiguate() splits it, so it is expected to be a string.
+ * @throws {Error} If the server answers with a non-2xx status.
+ */
+async function invokeConcraftRemotely(dag) {
+  const response = await fetch(`${CONCRAFT_SERVER_URL}/parse`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify({
+      dag
+    })
+  });
+
+  if (!response.ok) {
+    throw new Error(`Concraft server responded with ${response.status} ${response.statusText}`);
+  }
+
+  const data = await response.json();
+  return data.dag;
+}
+
+/**
+ * Disambiguates the result of analyse() with Concraft.
+ *
+ * @param {object[][]} analysis - Result of analyse().
+ * @returns {Promise<object[]>} One object per non-blank row of Concraft
+ *   output, keyed by the column names listed in `keys`.
+ */
+async function desambiguate(analysis) {
+  const input = analysis
+    .flat()
+    // Keep only the first dot-separated alternative of every tag position.
+    .map(a => ({ ...a, tags: a.tags.split(":").map(t => t.split(".")[0]).join(":") }))
+    // Pad every row with four placeholder columns.
+    .map(a => [...Object.values(a), "0.0", "_", "_", "_"].join("\t"))
+    .join("\n");
+
+  const tag = CONCRAFT_MODE === 'local' ? runConcraftLocally : invokeConcraftRemotely;
+  const response = await tag(input);
+
+  const keys = ["start", "end", "segment", "lemma", "tags", "frequency", "qualifiers", "prob", "interp_meta", "eos", "seg_meta", "disamb"];
+  const output = [];
+
+  for (const line of response.split(/\r?\n|\r/)) {
+    if (line.trim().length === 0) {
+      continue;
+    }
+
+    // A lone "_" column is turned into an empty string.
+    const properties = line.split(/\t/).map((prop, index) => [keys[index], prop === '_' ? '' : prop]);
+    const element = Object.fromEntries(properties);
+
+    element.start = Number.parseInt(element.start, 10);
+    element.end = Number.parseInt(element.end, 10);
+    element.disamb = element.disamb === 'disamb';
+    element.eos = element.eos === 'eos';
+    element.prob = Number.parseFloat(element.prob);
+
+    output.push(element);
+  }
+
+  return output;
+}
+
+/**
+ * Analyses `input` and disambiguates the result.
+ *
+ * @param {string} input - Text to analyse.
+ * @returns {Promise<object[]>} Result of desambiguate().
+ */
+async function danalyse(input) {
+  return desambiguate(await analyse(input));
+}
+
+/**
+ * Runs morfeusz_generator on `text` and keeps the entries carrying every tag.
+ *
+ * @param {string} text - Text written to the generator's stdin.
+ * @param {...string} tags - Substrings that must all occur in an entry's tags.
+ * @returns {object[]} Matching entries, flattened across segments.
+ */
+function generate(text, ...tags) {
+  const segments = parseSegments(runTool(MORFEUSZ_GENERATOR_BIN, [], text), GENERATOR_LINE_PATTERN);
+
+  return segments.flat().filter(entry => tags.every(tag => entry.tags.includes(tag)));
+}
+
+/**
+ * Command-line entry point: runs the action named by the third argv entry.
+ *
+ * @param {string[]} argv - process.argv; remaining entries go to the action.
+ * @throws {Error} If the action is not analyse, danalyse or generate.
+ */
+async function main([ interpreter, script, action, ...args ]) {
+  const actions = { analyse, danalyse, generate };
+
+  if (!Object.hasOwn(actions, action)) {
+    throw new Error(`Unknown action: '${action}'`);
+  }
+
+  const output = await actions[action](...args);
+
+  console.log(JSON.stringify(output, undefined, 2));
+}
+
+main(process.argv).catch(error => {
+  console.error(error);
+  process.exitCode = 1;
+});